
    qi                       d dl mZ d dlZd dlZd dlmZmZmZmZ d dl	Z	d dl
Z
d dlmZ d dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z" erd dl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ dZ,e
jZ                  j]                  e/d      Z0 G d d      Z1 G d de2      Z3 G d de      Z4 G d de!      Z5 G d de"      Z6y)    )annotationsN)AnyOptionalTYPE_CHECKINGUnion)
OrderedSet)has_tpu_pallas   )configtorch_dtype_to_jax)get_fused_kernel_nameget_kernel_metadata)V   )BlockPatternMatcher)BackendFeatureCSEVariableIndentedBufferOpOverrides)pexpr
SIMDKernelSIMDScheduling)CallableSequence)IRNode)ReductionType)BaseSchedulerNodemainkernel_codec                  .    e Zd ZdZ	 d	 	 	 ddZdddZy)PallasKernelWrapperz6Wrapper to provide .run() interface for Pallas kernelsNc                L    || _         || _        t        j                  d|       y )NzPallas kernel path: %s)	kernel_fnkernel_pathkernel_code_loginfo)selfr$   r%   s      d/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/pallas.py__init__zPallasKernelWrapper.__init__(   s$     #&5{C    )streamc               *     | j                   |d|i|S )a5  
        Execute the Pallas kernel.

        Args:
            *args: Arguments to pass to the kernel function
            stream: CUDA stream to pass to the kernel function
            **kwargs: Additional keyword arguments for the kernel

        Returns:
            Result of the kernel execution
        r,   )r$   )r(   r,   argskwargss       r)   runzPallasKernelWrapper.run/   s     t~~t=F=f==r+   N)r$   zCallable[..., Any]r%   Optional[str])__name__
__module____qualname____doc__r*   r0    r+   r)   r"   r"   %   s.    @ KOD+D:GD !% >r+   r"   c                      e Zd ZdZy)UnsupportedzJException raised when an operation is not supported by the Pallas backend.N)r3   r4   r5   r6   r7   r+   r)   r9   r9   >   s    Tr+   r9   c                     e Zd ZdZedsd       Zedsd       Zedsd       Zedsd       Zedsd       Z	edsd       Z
edsd       Zedsd	       Zedsd
       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedsd       Zedtd       Zedtd       Z edtd       Z!edud       Z"e	 	 dv	 	 	 	 	 	 	 	 	 dwd!       Z#edxd"       Z$edyd#       Z%edzd$       Z&edsd%       Z'edsd&       Z(edsd'       Z)edsd(       Z*edsd)       Z+edsd*       Z,edtd+       Z-edtd,       Z.edtd-       Z/edtd.       Z0edtd/       Z1edsd0       Z2edsd1       Z3edsd2       Z4edtd3       Z5edtd4       Z6edtd5       Z7edsd6       Z8edtd7       Z9edtd8       Z:edtd9       Z;edtd:       Z<edtd;       Z=edtd<       Z>edtd=       Z?ed{d>       Z@ed{d?       ZAedsd@       ZBedsdA       ZCedsdB       ZDedsdC       ZEedsdD       ZFedsdE       ZGedsdF       ZHedsdG       ZIedsdH       ZJedsdI       ZKedsdJ       ZLedsdK       ZMedsdL       ZNedsdM       ZOedsdN       ZPedsdO       ZQed|dP       ZRed|dQ       ZSed|dR       ZTed|dS       ZUed|dT       ZVedsdU       ZWed|dV       ZXed|dW       ZYed|dX       ZZed}dY       Z[ed}dZ       Z\ed}d[       Z]ed}d\       Z^ed}d]       Z_ed}d^       Z`ed}d_       Zaed}d`       Zbed}da       Zced}db       Zded}dc       Zeed}dd       Zfedsde       Zgedsdf       Zhed~dg       Ziedtdh       Zjedtdi       Zkedtdj       Zledsdk       Zmedsdl       Znedtdm       Zoedtdn       Zpedtdo       Zqedsdp       Zredtdq       Zsedtdr       Zty )PallasKernelOverridesz
    Map element-wise ops to JAX/Pallas operations.

    For now, we use the default Python operators which are compatible
    with JAX numpy broadcasting semantics.
    c                    d|  dS )Nzjnp.sin()r7   xs    r)   sinzPallasKernelOverrides.sinJ       !Ar+   c                    d|  dS )Nzjnp.cos(r=   r7   r>   s    r)   coszPallasKernelOverrides.cosN   rA   r+   c                    d|  dS )Nzjnp.tan(r=   r7   r>   s    r)   tanzPallasKernelOverrides.tanR   rA   r+   c                    d|  dS )Nz	jnp.sinh(r=   r7   r>   s    r)   sinhzPallasKernelOverrides.sinhV       1#Qr+   c                    d|  dS )Nz	jnp.cosh(r=   r7   r>   s    r)   coshzPallasKernelOverrides.coshZ   rH   r+   c                    d|  dS )Nz	jnp.tanh(r=   r7   r>   s    r)   tanhzPallasKernelOverrides.tanh^   rH   r+   c                    d|  dS )Nzjnp.arcsin(r=   r7   r>   s    r)   asinzPallasKernelOverrides.asinb       QCq!!r+   c                    d|  dS )Nzjnp.arccos(r=   r7   r>   s    r)   acoszPallasKernelOverrides.acosf   rO   r+   c                    d|  dS )Nzjnp.arctan(r=   r7   r>   s    r)   atanzPallasKernelOverrides.atanj   rO   r+   c                    d|  dS )Nzjnp.exp(r=   r7   r>   s    r)   expzPallasKernelOverrides.expn   rA   r+   c                    d|  dS )Nz	jnp.exp2(r=   r7   r>   s    r)   exp2zPallasKernelOverrides.exp2r   rH   r+   c                    d|  dS )Nz
jnp.expm1(r=   r7   r>   s    r)   expm1zPallasKernelOverrides.expm1v       A3a  r+   c                    d|  dS )Nzjnp.log(r=   r7   r>   s    r)   logzPallasKernelOverrides.logz   rA   r+   c                    d|  dS )Nz
jnp.log10(r=   r7   r>   s    r)   log10zPallasKernelOverrides.log10~   rZ   r+   c                    d|  dS )Nz	jnp.log2(r=   r7   r>   s    r)   log2zPallasKernelOverrides.log2   rH   r+   c                    d|  dS )Nz
jnp.log1p(r=   r7   r>   s    r)   log1pzPallasKernelOverrides.log1p   rZ   r+   c                    d|  dS )Nz	jnp.sqrt(r=   r7   r>   s    r)   sqrtzPallasKernelOverrides.sqrt   rH   r+   c                    d|  dS )Nz(1.0 / jnp.sqrt())r7   r>   s    r)   rsqrtzPallasKernelOverrides.rsqrt   s    !!B''r+   c                    d|  dS )Nzjnp.abs(r=   r7   r>   s    r)   abszPallasKernelOverrides.abs   rA   r+   c                    d|  dS )Nz(-r=   r7   r>   s    r)   negzPallasKernelOverrides.neg   s    A3ayr+   c                    d|  dS )Nz
jnp.floor(r=   r7   r>   s    r)   floorzPallasKernelOverrides.floor   rZ   r+   c                    d|  dS )Nz	jnp.ceil(r=   r7   r>   s    r)   ceilzPallasKernelOverrides.ceil   rH   r+   c                    d|  dS )Nz
jnp.trunc(r=   r7   r>   s    r)   trunczPallasKernelOverrides.trunc   rZ   r+   c                    d|  dS )Nz
jnp.round(r=   r7   r>   s    r)   roundzPallasKernelOverrides.round   rZ   r+   c                    d|  dS )Nz(1.0 / (1.0 + jnp.exp(-z)))r7   r>   s    r)   sigmoidzPallasKernelOverrides.sigmoid   s    (3//r+   c                    d|  dS )Njnp.maximum(z, 0)r7   r>   s    r)   reluzPallasKernelOverrides.relu   s    aS%%r+   c                    d|  d| dS )Nz
jnp.power(, r=   r7   abs     r)   powzPallasKernelOverrides.pow       A3b1%%r+   c                    d|  d| dS )Nrw   rz   r=   r7   r{   s     r)   maximumzPallasKernelOverrides.maximum       aS1#Q''r+   c                    d|  d| dS )Nzjnp.minimum(rz   r=   r7   r{   s     r)   minimumzPallasKernelOverrides.minimum   r   r+   c                    d|  d| d| dS )N
jnp.where(rz   r=   r7   )condr|   r}   s      r)   wherezPallasKernelOverrides.where   s    D6A3b1--r+   Nc                *    t        |      }d|  d| dS )Nzjnp.asarray(	).astype(r=   r   )r?   dtype	src_dtypeuse_compute_types	jax_dtypes        r)   to_dtypezPallasKernelOverrides.to_dtype   s#     'u-	aS	)A66r+   c                F    t        |      }t        |      }d|  d| d| dS )z=Bitcast a value from one dtype to another with the same size.z)jax.lax.bitcast_convert_type(jnp.asarray(r   z), r=   r   )r?   r   r   r   jax_src_dtypes        r)   to_dtype_bitcastz&PallasKernelOverrides.to_dtype_bitcast   s7     'u-	*95:1#Y}oUXYbXccdeer+   c                F   ddl m} t        j                  j	                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  | ||             }t        j                  ||      S )z>Convert a sympy expression to a JAX array indexing expression.r
   )get_bounds_index_expr)bounds)utilsr   r   kernelkexprprepare_indexingcsegeneratecomputer;   r   )exprr   r   idx_strvars        r)   
index_exprz PallasKernelOverrides.index_expr   sq     	2((..!:!:4!@Ahhll##HHg.CD.I $ 
 %--c599r+   c                    t        |      }|t        j                  k(  r| rdS dS t        | t              r4t        j                  |       ryt        j                  |       r	| dkD  rdS dS d|  d| d	S )
z/Convert a constant value to JAX representation.TrueFalsezjnp.nanr   zjnp.infz-jnp.infz
jnp.array(z, dtype=r=   )r   torchbool
isinstancefloatmathisnanisinf)valr   r   s      r)   constantzPallasKernelOverrides.constant   sp     'u-	EJJ 6-g-c5!zz# zz#$'!Gy;;C5155r+   c                    d|  dS )Nz	jnp.real(r=   r7   r>   s    r)   realzPallasKernelOverrides.real   rH   r+   c                    d|  dS )Nz	jnp.imag(r=   r7   r>   s    r)   imagzPallasKernelOverrides.imag   rH   r+   c                    d|  dS )Nz	jnp.conj(r=   r7   r>   s    r)   conjzPallasKernelOverrides.conj   rH   r+   c                    d|  dS )Nz
jnp.angle(r=   r7   r>   s    r)   anglezPallasKernelOverrides.angle   rZ   r+   c                    d|  d|  dS )z8View complex tensor as real tensor with extra dimension.zjnp.stack([jnp.real(z), jnp.imag(z)], axis=-1)r7   r>   s    r)   view_as_realz"PallasKernelOverrides.view_as_real   s     &aSQC|DDr+   c                    d|  d|  dS )z#View real tensor as complex tensor.(z[..., 0] + 1j * z	[..., 1])r7   r>   s    r)   view_as_complexz%PallasKernelOverrides.view_as_complex  s     1#%aS	22r+   c                    d|  d| dS )Nr   z == r=   r7   r{   s     r)   eqzPallasKernelOverrides.eq	      1#T!Ar+   c                    d|  d| dS )Nr   z != r=   r7   r{   s     r)   nezPallasKernelOverrides.ne  r   r+   c                    d|  d| dS )Nr   z < r=   r7   r{   s     r)   ltzPallasKernelOverrides.lt      1#S1~r+   c                    d|  d| dS )Nr   z <= r=   r7   r{   s     r)   lezPallasKernelOverrides.le  r   r+   c                    d|  d| dS )Nr   z > r=   r7   r{   s     r)   gtzPallasKernelOverrides.gt  r   r+   c                    d|  dS )Nz
jnp.isnan(r=   r7   r>   s    r)   r   zPallasKernelOverrides.isnan  rZ   r+   c                    d|  dS )Nz
jnp.isinf(r=   r7   r>   s    r)   r   zPallasKernelOverrides.isinf!  rZ   r+   c                    d|  dS )Nzjnp.isfinite(r=   r7   r>   s    r)   isfinitezPallasKernelOverrides.isfinite%  s    qc##r+   c                    d|  d| dS )Nr   z >= r=   r7   r{   s     r)   gezPallasKernelOverrides.ge)  r   r+   c                    d|  d| dS )Nzjnp.logical_and(rz   r=   r7   r{   s     r)   logical_andz!PallasKernelOverrides.logical_and.      !!Bqc++r+   c                    d|  d| dS )Nzjnp.logical_or(rz   r=   r7   r{   s     r)   
logical_orz PallasKernelOverrides.logical_or2       2aS**r+   c                    d|  dS )Nzjnp.logical_not(r=   r7   r>   s    r)   logical_notz!PallasKernelOverrides.logical_not6      !!A&&r+   c                    d|  d| dS )Nzjnp.logical_xor(rz   r=   r7   r{   s     r)   logical_xorz!PallasKernelOverrides.logical_xor:  r   r+   c                    d|  d| dS )Nzjnp.arctan2(rz   r=   r7   r{   s     r)   atan2zPallasKernelOverrides.atan2?  r   r+   c                    d|  d| dS )Nz
jnp.hypot(rz   r=   r7   r{   s     r)   hypotzPallasKernelOverrides.hypotC  r   r+   c                    d|  d| dS )Nz	jnp.fmod(rz   r=   r7   r{   s     r)   fmodzPallasKernelOverrides.fmodG  s    1#Rs!$$r+   c                    d|  d| dS )Nzjnp.remainder(rz   r=   r7   r{   s     r)   	remainderzPallasKernelOverrides.remainderK      s"QCq))r+   c                &    d|  d| d|  d| d|  dS )Nz
(jnp.sign(z) * jnp.sign(z) * (jnp.abs(z) // jnp.abs(z))).astype(.dtype)r7   r{   s     r)   truncdivzPallasKernelOverrides.truncdivO  s.     A3mA3mA3mA3kZ[Y\\cddr+   c                    d|  d| dS )Nr   z // r=   r7   r{   s     r)   floordivzPallasKernelOverrides.floordivU  r   r+   c                    d|  d| d| dS Nz	jnp.clip(rz   r=   r7   r?   min_valmax_vals      r)   clampzPallasKernelOverrides.clampY      1#Ry7)155r+   c                    d|  d| d| dS r   r7   r   s      r)   clipzPallasKernelOverrides.clip]  r   r+   c                    d|  dS )Nz	jnp.sign(r=   r7   r>   s    r)   signzPallasKernelOverrides.signb  rH   r+   c                    d|  dS )Nzjnp.signbit(r=   r7   r>   s    r)   signbitzPallasKernelOverrides.signbitf  s    aS""r+   c                    d|  dS )Nzjax.scipy.special.erf(r=   r7   r>   s    r)   erfzPallasKernelOverrides.erfk  s    's!,,r+   c                    d|  dS )Nzjax.scipy.special.erfc(r=   r7   r>   s    r)   erfczPallasKernelOverrides.erfco  s    (1--r+   c                    d|  dS )Nzjax.scipy.special.erfinv(r=   r7   r>   s    r)   erfinvzPallasKernelOverrides.erfinvs  s    *1#Q//r+   c                    d|  dS )Nzjax.scipy.special.gammaln(r=   r7   r>   s    r)   lgammazPallasKernelOverrides.lgammaw      +A3a00r+   c                    d|  dS )Nzjax.scipy.special.digamma(r=   r7   r>   s    r)   digammazPallasKernelOverrides.digamma{  r   r+   c                    d|  d|  d|  dS )Nr   z>.astype(jnp.float64) == 0.0, 1.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=0)[0]).astype(r   r7   r>   s    r)   	bessel_j0zPallasKernelOverrides.bessel_j0  +      ++,# .c"	
r+   c                    d|  d|  d|  dS )Nr   z>.astype(jnp.float64) == 0.0, 0.0, jax.scipy.special.bessel_jn(z&.astype(jnp.float64), v=1)[1]).astype(r   r7   r>   s    r)   	bessel_j1zPallasKernelOverrides.bessel_j1  r   r+   c                    d|  d|  dS Njax.lax.bessel_i0e() * jnp.exp(jnp.abs(rf   r7   r>   s    r)   modified_bessel_i0z(PallasKernelOverrides.modified_bessel_i0       %QC';A3bAAr+   c                    d|  d|  dS Njax.lax.bessel_i1e(r  rf   r7   r>   s    r)   modified_bessel_i1z(PallasKernelOverrides.modified_bessel_i1  r  r+   c                    d|  d|  d|  dS )Nr   z == 0.0, 1.0, jnp.sin(z) / r=   r7   r>   s    r)   spherical_bessel_j0z)PallasKernelOverrides.spherical_bessel_j0  s      A34QCtA3a@@r+   c                    d|  d|  dS r  r7   r>   s    r)   i0zPallasKernelOverrides.i0       %QC';A3bAAr+   c                    d|  dS )Nr  r=   r7   r>   s    r)   i0ezPallasKernelOverrides.i0e       %QCq))r+   c                    d|  d|  dS r	  r7   r>   s    r)   i1zPallasKernelOverrides.i1  r  r+   c                    d|  dS )Nr
  r=   r7   r>   s    r)   i1ezPallasKernelOverrides.i1e  r  r+   c                    d|  d| dS Nzjax.scipy.special.gammainc(rz   r=   r7   r?   ys     r)   gammainczPallasKernelOverrides.gammainc  s     -QCr!A66r+   c                    d|  d| dS Nzjax.scipy.special.gammaincc(rz   r=   r7   r  s     r)   	gammaincczPallasKernelOverrides.gammaincc       .aS1#Q77r+   c                    d|  d| dS r  r7   r  s     r)   igammazPallasKernelOverrides.igamma  s     -QCr!A66r+   c                    d|  d| dS r  r7   r  s     r)   igammaczPallasKernelOverrides.igammac  r   r+   c                    d|  d| dS )Nzjax.scipy.special.polygamma(z.astype(jnp.int32), r=   r7   r  s     r)   	polygammazPallasKernelOverrides.polygamma  s     .aS0DQCqIIr+   c                    d|  dS )Nzjax.scipy.special.ndtri(r=   r7   r>   s    r)   ndtrizPallasKernelOverrides.ndtri  s     *!A..r+   c                    d|  d| dS )Nzjax.scipy.special.zeta(rz   r=   r7   r  s     r)   zetazPallasKernelOverrides.zeta  s     )2aS22r+   c                    d|  d| dS )Nzjax.scipy.special.xlogy(rz   r=   r7   r  s     r)   xlogyzPallasKernelOverrides.xlogy  s     *!Bqc33r+   c                    d|  d| dS )Nzjax.scipy.special.xlog1py(rz   r=   r7   r  s     r)   xlog1pyzPallasKernelOverrides.xlog1py  s     ,A3b155r+   c                >    d|  d| d|  d|  d| d|  d| d| d	|  d
S )Njnp.where(jnp.abs() <= 1, jnp.cos( * jnp.arccos(jnp.clip(, -1, 1))), jnp.where( > 1, jnp.cosh( * jnp.arccosh(jnp.maximum(, 1.0))), ((-1.0) ** ) * jnp.cosh( * jnp.arccosh(jnp.maximum(-
, 1.0)))))r7   r?   ns     r)   chebyshev_polynomial_tz,PallasKernelOverrides.chebyshev_polynomial_t  s[     ! $c0 4 s5aS 9M!,H:	W	
r+   c                    dj                  g d|  d| d|  d|  d|  d|  d| d	| d
|  d|  d|  d| d| d| d| d|  d|  d      S )N r0  ) < 1, jnp.sin(( + 1) * jnp.arccos(jnp.clip(z&, -1, 1))) / jnp.sqrt(jnp.maximum(1 - z**2, 1e-10)), jnp.where( >= 1, jnp.where( == 1,  + 1.0, jnp.sinh((  + 1) * jnp.arccosh(jnp.maximum( , 1.0))) / jnp.sqrt(jnp.maximum(**2 - 1, 1e-10))), jnp.where( == -1, ((-1.0) ** ) * ( + 1.0), ((-1.0) ** ) * jnp.sinh((! + 1) * jnp.arccosh(jnp.maximum(-**2 - 1, 1e-10)))))joinr:  s     r)   chebyshev_polynomial_uz,PallasKernelOverrides.chebyshev_polynomial_u  s   	; 	;  	; 	; $ 	;s	;6	;78c	;:(	;()s	;+	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r+   c                   dj                  g d| d|  d| d|  d| d|  d|  d| d	|  d
|  d|  d| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )Nr>  r    == 0, jnp.ones_like(), jnp.where(	 == 1, 2* - 1, jnp.where(	 == 2, 4***2 - 2*	 == 3, 8***3 - 4***2 - 4* + 1, jnp.where(
 == 4, 16***4 - 8*	**3 - 12***2 + 4*
 == 5, 32*	**5 - 16*	**4 - 32*	**3 + 12***2 + 6* - 1, jnp.zeros_like()))))))rM  r:  s     r)   chebyshev_polynomial_vz,PallasKernelOverrides.chebyshev_polynomial_v  
   ) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r+   c                   dj                  g d| d|  d| d|  d| d|  d|  d	| d
|  d|  d|  d	| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )Nr>  r   rQ  rR  rS  rZ  rU  **2 + 2*rT  rW  **3 + 4*rY  r[  **4 + 8*r]  r_  	**5 + 16*ra  rc   + 1, jnp.zeros_like(re  rM  r:  s     r)   chebyshev_polynomial_wz,PallasKernelOverrides.chebyshev_polynomial_w  rg  r+   c                J    d|  d}d| d| d| d| d| d| d	| d
| d| dS )N(2 *  - 1)r0  r1  r2  r3  r4  r5  r6  r7  r8  r9  r7   r?   r;  r  s      r)   shifted_chebyshev_polynomial_tz4PallasKernelOverrides.shifted_chebyshev_polynomial_t/  si    
 A3e  $c0 4 s5aS 9M!,H:	W	
r+   c                    d|  d}dj                  g d| d| d| d| d| d	| d
| d| d| d| d| d| d| d| d| d| d| d      S )Nrp  rq  r>  r0  r?  r@  z', -1, 1))) / jnp.sqrt(jnp.maximum(1 - (z)**2, 1e-10)), jnp.where(rA  rB  rC  rD  rE  rF  rG  rH  rI  rJ  rK  rL  rM  rr  s      r)   shifted_chebyshev_polynomial_uz4PallasKernelOverrides.shifted_chebyshev_polynomial_u=  s    A3e	; 	;  	; 	; $ 	;s	;6	;78c	;:)	;)*	;,	; 	;	; 		; #		; $%#		;&		;
 	;
 <	;
 =>3	;
?$	; %&3	;'	; 	; /	; 01c	; 27	; 89c	;:	; 	; +	; ,-#	; .O	; PQc	;R$	; %&3	; ':	;	
r+   c                   d|  d}dj                  g d| d|  d| d| d| d	| d
| d| d| d| d| d| d| d| d| d| d| d| d| d| d| d| d|  d      S )Nrp  rq  r>  r   rQ  rR  rS  rT  rU  rV  rW  rX  rY  rZ  r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rM  rr  s      r)   shifted_chebyshev_polynomial_vz4PallasKernelOverrides.shifted_chebyshev_polynomial_vO      A3e) )j ) )0 ) ) 4 ))$)%&C)())$)%&C)'/)01s)3) ) %) &'C) (0) 12s) 3;) <=#)>) 	) &	) '(S	) )1	) 23	) 4=	) >?C	) @H	) IJs	)K	)
 )
 &)
 '(S)
 )2)
 34)
 5>)
 ?@S)
 AJ)
 KL)
 MU)
 VWTW)
X)  S) !()	
r+   c                   d|  d}dj                  g d| d|  d| d| d| d	| d
| d| d| d| d| d| d| d| d| d| d| d| d| d| d| d| d|  d      S )Nrp  rq  r>  r   rQ  rR  rS  rZ  rU  ri  rT  rW  rj  rY  r[  rk  r]  r_  rl  ra  rc  rm  re  rM  rr  s      r)   shifted_chebyshev_polynomial_wz4PallasKernelOverrides.shifted_chebyshev_polynomial_w]  rx  r+   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )Nr>  r   rQ  rR  z == 1, 2 * , jnp.where(z == 2, 4 * z**2 - 2, jnp.where(z == 3, 8 * z**3 - 12 * z == 4, 16 * z**4 - 48 * z**2 + 12, jnp.where(z == 5, 32 * z**5 - 160 * z**3 + 120 * , jnp.zeros_like(re  rM  r:  s     r)   hermite_polynomial_hz*PallasKernelOverrides.hermite_polynomial_hk  s~   ) )j ) )0 ) ) 4 ))&)'(c)*))&)'(c)*) ) ') ()c) *5) 67C)8) 	) (	) )*s	) +6	) 78S	)9	)
 )
 ()
 )*s)
 +7)
 89c)
 :F)
 GHS)
I)  S) !()	
r+   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )Nr>  r   rQ  rR  rB  r|  z == 2, z**2 - 1, jnp.where(z == 3, 
**3 - 3 * z == 4, z
**4 - 6 * z**2 + 3, jnp.where(z == 5, z**5 - 10 * **3 + 15 * r}  re  rM  r:  s     r)   hermite_polynomial_hez+PallasKernelOverrides.hermite_polynomial_he}  s}   
) )j ) )0 ) ) 4 ))")#$#)&))")#$#)&) ) #) $%#) &0) 12s)3) 	) #	) $%#	) &0	) 12s	)3	)
 )
 #)
 $%#)
 &1)
 23)
 4?)
 @Ac)
B)  S) !()	
r+   c                   dj                  g d| d|  d| d|  d| d|  d|  d	| d
|  d|  d|  d| d|  d|  d|  d|  d| d|  d|  d|  d|  d|  d|  d      S )Nr>  r   rQ  rR  z == 1, 1 - r|  z == 2, (rY  z + 2) / 2, jnp.where(z	 == 3, (-z**3 + 9*z	**2 - 18*z + 6) / 6, jnp.where(z == 4, (z	**4 - 16*z	**3 + 72*z	**2 - 96*z + 24) / 24, jnp.where(z	 == 5, (-z	**5 + 25*z
**4 - 200*z
**3 + 600*z
**2 - 600*z + 120) / 120, jnp.zeros_like(re  rM  r:  s     r)   laguerre_polynomial_lz+PallasKernelOverrides.laguerre_polynomial_l  s	   
) )j ) )0 ) ) 4 ))&)'(c)*))#)$%3)&.)/0c)2) ) %) &'C) (0) 12s) 3<) =>3)?) 	) $	) %&3	) '0	) 12s	) 3<	) =>3	) ?H	) IJs	)K	)
 )
 %)
 &'C)
 (1)
 23)
 4>)
 ?@S)
 AK)
 LM#)
 NX)
 YZWZ)
[)  S) !()	
r+   c                    dj                  g d| d|  d| d|  d| d|  d| d	|  d
|  d| d|  d|  d| d|  d|  d|  d|  d      S )Nr>  r   rQ  rR  rB  r|  z == 2, (3 * z**2 - 1) / 2, jnp.where(z == 3, (5 * r  z) / 2, jnp.where(z == 4, (35 * z**4 - 30 * z**2 + 3) / 8, jnp.where(z == 5, (63 * z**5 - 70 * r  z) / 8, jnp.zeros_like(re  rM  r:  s     r)   legendre_polynomial_pz+PallasKernelOverrides.legendre_polynomial_p  s~   
) )j ) )0 ) ) 4 ))")#$#)&))')()s)+) ) () )*s) +5) 67C)8) 	) )	) *+	) ,7	) 89c	):	)
 )
 ))
 *+)
 ,7)
 89c)
 :E)
 FGC)
H)  S) !()	
r+   c                    d|  dS )Nzjnp.reciprocal(r=   r7   r>   s    r)   
reciprocalz PallasKernelOverrides.reciprocal  s     1%%r+   c                    d|  dS )Nzjnp.square(r=   r7   r>   s    r)   squarezPallasKernelOverrides.square  rO   r+   c                    d|  d| d| dS )zFused multiply-add: a * b + czjnp.fma(rz   r=   r7   )r|   r}   cs      r)   fmazPallasKernelOverrides.fma  s     !BqcA3a((r+   c                    d|  d| dS )Nzjnp.copysign(rz   r=   r7   r{   s     r)   copysignzPallasKernelOverrides.copysign  s    qcA3a((r+   c                    d|  d| dS )Nzjnp.nextafter(rz   r=   r7   r{   s     r)   	nextafterzPallasKernelOverrides.nextafter  r   r+   c                    d|  d| dS )Nz
jnp.ldexp(rz   r=   r7   r{   s     r)   ldexpzPallasKernelOverrides.ldexp  r   r+   c                    d|  dS )Nz
jnp.frexp(r=   r7   r>   s    r)   frexpzPallasKernelOverrides.frexp  rZ   r+   c                    d|  dS )Nz	jnp.modf(r=   r7   r>   s    r)   modfzPallasKernelOverrides.modf  rH   r+   c                    d|  d| dS )Nzjnp.bitwise_and(rz   r=   r7   r{   s     r)   bitwise_andz!PallasKernelOverrides.bitwise_and  r   r+   c                    d|  d| dS )Nzjnp.bitwise_or(rz   r=   r7   r{   s     r)   
bitwise_orz PallasKernelOverrides.bitwise_or  r   r+   c                    d|  d| dS )Nzjnp.bitwise_xor(rz   r=   r7   r{   s     r)   bitwise_xorz!PallasKernelOverrides.bitwise_xor  r   r+   c                    d|  dS )Nzjnp.bitwise_not(r=   r7   r>   s    r)   bitwise_notz!PallasKernelOverrides.bitwise_not  r   r+   c                    d|  d| dS )Nzjnp.left_shift(rz   r=   r7   r{   s     r)   
left_shiftz PallasKernelOverrides.left_shift  r   r+   c                    d|  d| dS )Nzjnp.right_shift(rz   r=   r7   r{   s     r)   right_shiftz!PallasKernelOverrides.right_shift  r   r+   )r?   strreturnr  )r|   r  r}   r  r  r  )r   r  r|   r  r}   r  r  r  )NT)
r?   r  r   torch.dtyper   zOptional[torch.dtype]r   r   r  r  )r?   r  r   r  r   r  r  r  )r   
sympy.Exprr   r  r  r  )r   r  r  r  )r?   r  r   r  r   r  r  r  )r?   r  r  r  r  r  )r?   r  r;  r  r  r  )r|   r  r}   r  r  r  r  r  )ur3   r4   r5   r6   staticmethodr@   rC   rE   rG   rJ   rL   rN   rQ   rS   rU   rW   rY   r\   r^   r`   rb   rd   rg   ri   rk   rm   ro   rq   rs   ru   rx   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r"  r$  r&  r(  r*  r,  r.  r<  rO  rf  rn  rs  ru  rw  rz  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r7   r+   r)   r;   r;   B   s	                      " " " " " "       ! !   ! !     ! !     ( (     ! !     ! ! ! ! 0 0 & & & & ( ( ( ( . .  ,0"&	777 )7  	7
 
7 7 f f : : 6 6             ! ! E E 3 3
           ! ! ! ! $ $   , , + + ' ' , , ( ( & & % % * * e e
   6 6 6 6     # # - - . . 0 0 1 1 1 1 	
 	
 	
 	
 B B
 B B
 A A
 B B * * B B * * 7 7
 8 8 7 7 8 8 J J
 / / 3 3 4 4 6 6 
 
 
 
( 
 
 
 
 
 
 
 
" 
 
 
 
 
 
" 
 
 
 
 
 
 & & " " ) ) ) ) * * & & ! !     , , + + , , ' ' + + , ,r+   r;   c                      e Zd ZU dZeZeZded<    fdZ		 	 	 	 	 	 	 	 	 	 ddZ
ddZddZddZdd	Zdd
ZddZddZddZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZedd       Zdd dZdd!dZ xZS )"PallasKernelad  
    Pallas kernel for elementwise operations with support for strided/scatter access.

    Strategy:
    - Convert index expressions to JAX-compatible array slicing
    - Load/store using indexed access: "in_ptrX[slice]" or full-array "in_ptrX[...]"
    - Compute expression with Python operators (compatible with jax.numpy broadcasting)
    - Generate Python code that defines a Pallas kernel and a host entrypoint.
    - Use async_compile.pallas path to compile and load Python code.

    For GPU (Triton backend):
    - Use masked loads/stores with power-of-2 block sizes to handle non-power-of-2 shapes
    zCallable[[sympy.Expr], str]r   c                    t        |   |i | t        j                  j	                         }|j
                  dk(  | _        d | _        i | _        g | _	        i | _
        y )Ncuda)superr*   r   graphget_current_device_or_throwtypeis_gpuuse_masked_opstensor_masksstore_with_outputload_index_exprs)r(   r.   r/   device	__class__s       r)   r*   zPallasKernel.__init__  sW    $)&)446kkV++/8:79r+   c                     y)z)Check array bounds for indirect indexing.Nr7   )r(   r   sizeloweruppers        r)   check_boundszPallasKernel.check_bounds  s    r+   c                    | j                  |      }t        |t        j                        ry|j                  rt        |      S | j                  |      S )a  
        Convert an index expression to a string suitable for Pallas indexing.

        Pallas operates on full arrays, so we need to convert index expressions
        to JAX array slicing. For example:
        - x0 -> "..." (contiguous access, full array)
        - 2*x0 -> "::2" (strided access with stride 2)
        - 2*x0 + 1 -> "1::2" (strided access with offset 1, stride 2)

        Args:
            index: The indexing expression to convert

        Returns:
            The indexing string to use in generated code
        ...)r   r   sympySymbol
is_Integerr  _convert_to_jax_slice)r(   indexprepared_indexs      r)   _get_index_strzPallasKernel._get_index_str	  sN    " ..u5 nell3&&~&& --n==r+   c                   | j                   syt        j                  j                  j	                  |      }|j
                  }t        | j                  j                               }||z  }t        |      dk(  rt        |      S t        |      dk(  rt        t        |            }t        j                  ||      }t        j                  ||      }|}||z
  }t        j                  j                  j	                  |      }|dk(  r|dk(  ry|dk(  r| j!                  |      }	d|	 S | j!                  |      }
| j!                  |      }	|
 d|	 S ||z
  }t        j                  j                  j	                  |      }|dk(  rV||k(  rQyt        |      dkD  rBd}|D ]7  }t        j                  ||      }t        j                  ||      }|dk7  s5d} n |ryy| j#                  |      S )aD  
        Convert a sympy index expression to JAX slice notation.

        Handles common patterns like:
        - stride*var -> ::stride
        - stride*var + offset -> offset::stride

        For more complex patterns, falls back to explicit indexing.
        Uses BlockPatternMatcher for robust pattern matching.
        r  r   r   z::TF)range_treesr   r  sizevarssimplifyfree_symbolsr   range_tree_nodeskeyslenr  nextiterr   get_subexpr_involving_symbolmatch_affine_block_exprr   _generate_index_array)r(   r  r  	iter_vars	used_varsr   var_exprstrideoffset
stride_str
offset_strall_unit_strides               r)   r  z"PallasKernel._convert_to_jax_slice(  s       ))%0)) t4499;<	 !9,	y>Qu:^q tI'C +GGsSH )@@3OF!)))226: Q;6Q; q[!%F!3J
|,, "&F!3J!%F!3J(\J<88 )))226:Q;8s? ^a #O  .KKESVW,DDXsSQ;&+O 
  ))%00r+   c                    t        d|       )z
        Generate JAX code to compute an index array for complex indexing patterns.

        For very complex patterns that can't be expressed as simple slices,
        we need to compute the indices explicitly. This is not yet fully implemented.
        z>Pallas backend does not yet support complex indexing pattern: )r9   )r(   r  s     r)   r  z"PallasKernel._generate_index_array~  s     LUGT
 	
r+   c                |    |j                   }t        | j                  j                               }t	        ||z        S )zFCheck if index expression contains iteration variables (x0, x1, etc.).)r  r   r  r  r   )r(   r  r  r  s       r)   _has_iteration_varsz PallasKernel._has_iteration_vars  s6    ))t4499;<	L9,--r+   c                b    |j                   }|D ]  }t        |      j                  d      s y y)zICheck if index expression contains indirect variables (tmp0, tmp1, etc.).tmpTF)r  r  
startswith)r(   r  r  syms       r)   _has_indirect_varszPallasKernel._has_indirect_vars  s6    )) 	C3x""5)	 r+   c                    | j                  |      }| j                  |      }|r|r| j                  |      dfS |r| j                  |      dfS | j	                  |      dfS )a  
        Get the index expression string and whether it needs flattening.

        Returns:
            Tuple of (index_str, needs_flatten) where needs_flatten indicates
            if the buffer should be flattened before indexing (for mixed indexing).
        TF)r  r  _handle_mixed_indexingr   r  )r(   r  has_indirecthas_iter_varss       r)   _get_index_exprzPallasKernel._get_index_expr  sl     ..u5007M..u5t;;::e$e++&&u-u44r+   c                "   | j                   syt               }|j                  | j                  j                  j                                |j                  | j                  j                  j                                |j                  t        j                  j                  j                                g }|D ]  }	 t        j                  j                  |      }|j                         }t        d |D              }d}|D ]"  }t        |d      r|t        |      z  }||z  }$ |j                  |||f        |rAt#        |      dkD  r3t%        d |D              }	|	sy|d   d   t%        fd|D              }
|
S y# t         $ r Y w xY w)	aB  
        Determine if we should use masked ops for this entire kernel.

        Masked ops with pl.ds(block_size) flatten tensors to 1D, which works when:
        1. We're on GPU (CUDA backend uses Triton which requires power-of-2 sizes)
        2. All tensors are already 1D (so flattening doesn't change dimensionality)
        3. All tensors have the same size (so broadcasting works correctly)

        With per-tensor masks, each tensor gets its own mask based on its size.

        This should be called once in codegen_kernel() before generating the kernel body.
        Fc              3  N   K   | ]  }t        |d       rt        |      n|  yw__int__Nhasattrint.0ss     r)   	<genexpr>z@PallasKernel._determine_masked_ops_for_kernel.<locals>.<genexpr>  s"     S9(=c!f1DS   #%r   r  r   c              3  @   K   | ]  \  }}}t        |      d k(    ywr   N)r  )r  _shapes      r)   r  z@PallasKernel._determine_masked_ops_for_kernel.<locals>.<genexpr>  s     E[QqUqEs   r
   c              3  0   K   | ]  \  }}}|k(    y wr1   r7   )r  r  r  
first_sizes      r)   r  z@PallasKernel._determine_masked_ops_for_kernel.<locals>.<genexpr>  s     Nzq!T
 2Ns   )r  r   updater.   input_buffersr  output_buffersr   r  name_to_buffer
get_bufferget_sizetupler  r  append	Exceptionr  all)r(   all_buffer_namesbuf_infobuf_namebufr  r  
total_sizer  all_1dall_same_sizer  s              @r)    _determine_masked_ops_for_kernelz-PallasKernel._determine_masked_ops_for_kernel  st    {{ &< 			 7 7 < < >?		 8 8 = = ?@ 6 6 ; ; => ( 	Hgg((2||~SdSS
 (Aq),"c!f,
"a
	(
 5* =>	* H)EHEEF "!QJNXNNM  '  s   >A>F	FFc                d    || j                   vrd| }|| j                   |<   | j                   |   S )z2Get or create a unique mask variable for a buffer.mask_)r  )r(   r  mask_vars      r)   _get_or_create_maskz PallasKernel._get_or_create_mask  s=    4,,,xj)H*2Dh'  **r+   c                   | j                   j                  |      }t        j                  j	                  |      }|| j
                  |<   | j                  | j                         | _        | j                  |      \  }}|dk(  xr | xr | j                  }|r| j                  |      }d| d| d}	n|r	| d| d}	n| d| d}	| j                  j                  | j                  |	|      S )	Nr  zpltriton.load(z.at[pl.ds(block_size)], mask=r=   z[...].flatten()[][r   )r.   inputr   r  	get_dtyper  r  r  r  r  r   r   r   )
r(   namer  r  r   	index_strneeds_flatten
use_maskedr  	load_exprs
             r)   loadzPallasKernel.load  s	   iiood#!!$' ',d# &"&"G"G"ID#'#7#7#> 	= %'U,=U$BUBU
//5H(-J8*TUVI%/	{!<I %q1-Ixx  LL ! 
 	
r+   c                b   t        | j                  j                               }j                  }||z  }t	        |      dk(  r| j                        S fd}t        ||d      }|D cg c]
  } ||       }}| j                        }	|D 
cg c]  }
t        |
      j                  d      s|
! }}
|D cg c]  }t        |       }}|D 
ci c]  }
t        |
       ||
       }}
g }|D ]  }|j                   ||      d|f        |D ]  }|j                   ||      d|f        |j                  d d       t        |      D ]  \  }}t        |      }|| j                  v s | j                  |   }|j                  } ||      d	| j                  |       d
}t        fd|D              }t        fd|j                         D              }||z   }|dkD  rd|z  }| d| d}|	j                  ||      }	 |D ]  }||   t        fd|D              }t        fd|D              }|dkD  r|dkD  rd|z  }d|z  }| d| d| d}n(|dkD  rd|z  }| d| d}n|dkD  rd|z  }| d| d}n|}|	j                  ||      }	 |	S c c}w c c}
w c c}w c c}
w )a  
        Handle indexing with both indirect variables and iteration variables.

        For example, x[indices, :] generates index = i0 + stride * tmp0
        where tmp0 is loaded from indices and i0 is the iteration variable.

        We need to convert this to JAX advanced indexing with proper broadcasting.
        When there are multiple iteration variables, they need different shapes
        to form an outer product (grid) rather than broadcasting together.
        r   c                    j                  |       }|dk(  rt        j                  |       }	 t        |      S # t        t
        f$ r Y yw xY w)z>Extract the coefficient of a variable in the index expression.r   )coeffr  diffr  	TypeError
ValueError)r   r  r  s     r)   get_coefficientz<PallasKernel._handle_mixed_indexing.<locals>.get_coefficient'  sN    KK$Ez

5#.5z!z* s   
: AAT)keyreverser  r  indirectc                    | d   S Nr   r7   r>   s    r)   <lambda>z5PallasKernel._handle_mixed_indexing.<locals>.<lambda>D  s
    !A$ r+   zjnp.arange(r=   c              3  .   K   | ]  }|k  s	d   ywr  r7   r  r  	var_coeffs     r)   r  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>[  s     %NAIa%N   
c              3  .   K   | ]  }|k  s	d   yw)r
   Nr7   r*  s     r)   r  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>\  s      *1y=A*r,  z, Nonez[:r  c              3  .   K   | ]  }|kD  s	d   ywr  r7   r  r  indirect_coeffs     r)   r  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>l  s     I!a.6HAIr,  c              3  .   K   | ]  }|k  s	d   ywr  r7   r/  s     r)   r  z6PallasKernel._handle_mixed_indexing.<locals>.<genexpr>m  s     J1q>7IQJr,  zNone, r  r  z...]z[...)r   r  r  r  r  r   sortedr  r  r  sort	enumeratelengthsumvaluesreplace) r(   r  r  r  used_iter_vars_setr"  used_iter_varsr   iter_coeffsr  r  indirect_var_symsr  indirect_varsindirect_coeffsall_componentsivar_namerange_entry
range_sizearange_exprn_trailing_itern_trailing_indirect
n_trailingtrailing_dimsindirect_var	n_leadingleading_nonestrailing_nonesreshape_exprr0  r+  s     `                            @@r)   r  z#PallasKernel._handle_mixed_indexing  s<    t4499;<	)))I5!"a'::e$$
	   2QUV7EFs+FFJJu%	(4Q1A8I8I%8PQQQ->?cS?? @QQ!3q6?1#55QQ ! 	GC!!?3#7"EF	G$ 	KC!!?3#7S"IJ	K=  / 	EFAs3xHd+++"33C8(//
+C0	 +DJJz,B+C1E #&%N%N"N&) *.557* '# -/BB
>$,z$9M%0MM?!"DK%--hD	1	E6 * 	FL,\:N I{IIIJJJJ 1}a (9 4!)J!6".qs>BRRSTQ (9 4".qtDa!)J!6".tN3C1E+!)),EI+	F. W G R? Rs   .JJ"4J">J'J,c                   |t        d      | j                  j                  |      }| j                  j	                  |       | j
                  | j                         | _        	 t        j                  j                  |      }|j                         }t        |      dk(  }|r| d| }	ng| j                  |      \  }
}|
dk(  xr | xr | j
                  }|r| j                  |      }d| d| d	| d
}	n|
dk(  r| d| d| d}	n
| d|
 d| }	| j                  j!                  |	       | j"                  j%                  ||	f       y # t        $ r d}d}Y w xY w)Nzpallas store mode not supportedr   r7   Fz[...] = r  zpltriton.store(z.at[pl.ds(block_size)], z, mask=r=   	.reshape(.shape)r  z] = )r9   r.   outputstore_buffer_namesaddr  r  r   r  r  r  r  r  r  r  stores	writeliner  r  )r(   r  r  valuemodeoutr  output_shape	is_scalar
store_exprr  r  r  r  s                 r)   storezPallasKernel.store  s    ?@@iit$##D) &"&"G"G"ID	''$$T*C<<>LL)Q.I
 50J'+';';E'B$I} U"P='8PT=P=P  33D9.se3KE7RYZbYccde
e# !$uHUG9SEI
 !$uAi[UG<
j)%%sJ&78E  	LI	s   &=E EEc                     j                   sJ t        |t              rt        d      |||f}| j                  j
                  v r j                  j
                  |   S dddddddd	}t        g d
      }t         fd|D              }g }	t         j                  j                         d       D ]F  \  }
}|j                  j                  d      r"	 |	j                  t        |j                               H d}|D ]/  }| j$                  v s j$                  |   }	 |t        |      z  }1 d} j$                  D ]2  }|j                  d      s j$                  |   }	 |t        |      z  }4 t'        d  j                  j                         D              }|dk(  r|r|r|rd| d| d}nd| d}n|dv r}||   }|xr	 |xr |dkD  }|r_|dkD  rYd} j(                  rt+        t-         j(                  j/                                     } j                  j                         D 
cg c]"  \  }
}|j                  j                  d      r|
$ }}
}|r|d   }|j1                  |      }	 |dk7  rt        |      nd} j                  j                         D 
cg c]"  \  }
}|j                  j                  d      s|
$ }}
}|r2|d   }|j1                  |      }	 |dk7  rt        |      nd}||kD  rdnd}|dk(  r| d| d| d}nt        t3        |            }| d| d| d}np| d| d}ng||v r:|r,|r*|r(|	r&t5        |	      }||   }d| d| d| d| d| d | d}n5||    d| d}n)t        d!| d"t7        |j9                                d#       j                  j;                   j<                  ||$      } |  j                  j
                  |<   | S # t         t"        f$ r d}	Y  w xY w# t         t"        f$ r d}Y  w xY w# t         t"        f$ r d}Y  w xY wc c}}
w # t         t"        f$ r d}Y w xY wc c}}
w # t         t"        f$ r d}Y pw xY w)%aJ  
        Generate code for reduction operations in JAX/Pallas.

        Reductions in Pallas work by:
        1. Loading the input data into the kernel
        2. Applying JAX reduction operations (jnp.sum, jnp.max, etc.)
        3. Storing the reduced result

        The reduction happens over the loaded block of data.
        zHTuple reductions (e.g., welford_combine) not supported in Pallas backendzjnp.sumzjnp.prodzjnp.maxzjnp.minzjnp.anyz
jnp.argmaxz
jnp.argmin)r6  prodmaxminanyargmaxargmin)r?   r  zc              3  :   K   | ]  }|j                   v   y wr1   )numels)r  pr(   s     r)   r  z)PallasKernel.reduction.<locals>.<genexpr>  s     IA,Is   c                    t        | d         S r'  r  r>   s    r)   r(  z(PallasKernel.reduction.<locals>.<lambda>  s    QqT r+   )r#  rNr   c              3  \   K   | ]$  \  }}|j                   j                  d       rd & yw)rj  r   N)prefixr  )r  r   entrys      r)   r  z)PallasKernel.reduction.<locals>.<genexpr>  s.      
U||&&s+ 
s   *,xor_sumzjnp.bitwise_xor.reduce(rO  z, -1), axis=-1)r=   )rb  rc  r   r   z, axis=z(lambda v: (lambda pw_sizes: z(v.reshape(-1, z>), axis=-1) if v.ndim == 2 else (lambda input_shape, pw_axes: z=(jnp.moveaxis(v, pw_axes, list(range(len(pw_axes)))).reshape(z`, -1), axis=-1))(v.shape, [i for i, s in enumerate(v.shape) if s in pw_sizes][:len(pw_sizes)]))(z))(zReduction type 'z8' not yet supported in Pallas backend. Supported types: z	, xor_sumr  )inside_reductionr   r   r9   r   reduction_cacher   ra  r2  r  itemsrl  r  r  r  r5  r   r!  rf  r6  r  r  r  r7  r  ranger  listr  r   r   )!r(   r   r   reduction_typerV  	cache_keyreduction_opspointwise_prefixeshas_pointwisepointwise_sizesr   rm  pointwise_numelrg  numelreduction_numeln_reduction_dimsreduction_exprreduction_opis_partial_reductionreduction_axis
load_indexreduction_varsr_varr_coeffr_stridepw_varspw_varpw_coeff	pw_strideaxespw_sizes_strresults!   `                                r)   	reductionzPallasKernel.reduction  sP   " $$$$eU#Z 
 6	00088++I66 ""
 (8I6HII  !!'')/B
 	JC <<**3/#**3u||+<=	 # 	ADKKA#s5z1O		  	A||C A#s5z1O		  
"3399;
 
 Y&_#:5'?J[[j!k#:5'!C33 )8L I/Io6I ! $(81(< "#((!%d4+@+@+G+G+I&J!KJ +/*?*?*E*E*G&&C <<2237 &N &
 & .q 1","2"25"9)7>!|s7|H /3.C.C.I.I.K# *U#(<<#:#:3#?  # #
 #%,QZF'1'7'7'?H.=E]CMPQ	
 3;Y2FQAN#q((4~QugW^DTTU%VN !'7!89D(4~QugWTF!%LN %1>5'!;},###  #?3,^<3#nOO3D E5 $n %SSbRc d
 &c%
3  %2.$A#B!E7!!L">"2 3$$(););)=$>#?yJ  ""LL # 
 /5  +_ ":. &*O ":. &*O ":. &*OD& !*:6 )'(H)# %.z#: .,-	.sf   $O*#P&P6'P59P; ('Q+Q *P ?P PPP21P2;QQQ.-Q.c                    t         j                  j                  |       }|j                         }|j	                         S r1   )r   r  r  
get_layoutis_contiguous)buffer_namer  layouts      r)   _buffer_is_contiguousz"PallasKernel._buffer_is_contiguous}  s1    gg  -!##%%r+   c                  A t               }| j                  j                         \  }}}}|D cg c]  }|j                   }}|D cg c]  }|j	                  d      s| }}|D cg c]  }|j	                  d      s| }	}|	st        d      | j                  j                  j                         D 
ci c]  \  }
}t        |t              r||
 }}
}|xs d}t        j                  j                         j                  dk(  }t        j                  j                   j"                  }|rDt        j                  j                   j$                  st        d      t'               st        d      |rdnd	}d
| j(                  rdndz   dz   |sdndz   | j(                  rdndz   }|j+                  |d       i }|D ]4  }|j-                  |      }|duxr | j/                  |      }| xr |||<   6 |D cg c]  }||   s	| d }}|D cg c]  }|j	                  d      s| }}||z   }||z   }t1        |j                         D cg c]
  \  }}|r	| c}}      }t3        |	      D cg c]  \  }}||v s| }}}|| _        d| ddj7                  |       | j(                  rdndz   dz   }|j9                  |       |j;                         5  | j(                  r6| j<                  r)i }| j                  j>                  j                         D ]  \  }
}t        |t              r|n|
||
<    | j                  j                  j                         D ]  \  }
}t        |t              r|n|
||
<    tA        | j<                  j                               D ]{  \  } }!|j-                  | |       }"d}#|D ]  }|"|k(  s| t        |      v s|}# n |#s:|j9                  d|         |j9                  |! d|# d       |j9                  |! d|! d       } | jB                  rS| j(                  sF|j9                  d       d}$d}%|	ri|	d    }&|j-                  |&      }'|'rQ	 t        j                  jE                  |'      }(|(jG                         })tI        d! |)D              }$d"}%|$D ]  }*|%|*z  }%	 	 | jB                  j                         D ]  \  }+},t        |+      }-|,jL                  }.| jO                  |.      }/	 tQ        |.d#      rtS        |.      nd}0|0K|$rFtY        |$      d"kD  r8|0|%k(  r3dj7                  d$ |$D              }1|j9                  |- d%|/ d&|1 d'       |j9                  |- d%|/ d'        | jZ                  j\                  D ]  }2|j9                  t        |2              | j^                  D ]  \  }3}4|3|v s|j9                  |4        	 ddd       | d(}5g }6t3        |      D ]/  \  }}||v s|j	                  d)      s|6ja                  |d*z          1 |6rd+dj7                  d, |6D              z   d-z   }7nd.}7|j9                  d/|7 d'       |j9                  d|5 d0dj7                  |       d       |j;                         5  |j9                  d1       |j9                  d2       |j9                  d3       |j9                  d'       | j(                  r|j9                  d4       |j9                  d5       |j9                  d6       |D ]  }|j9                  d7| d8        |j9                  d9       |j9                  d:       |j9                  d;       |j9                  d<       g }8t3        |	      D ]w  \  }9}|j	                  d      r=|j-                  |d=      s*| d}:|jc                  |:      };|8ja                  |;|9f       T|jc                  |      };|8ja                  |;|9f       y dj7                  d> |8D              }<| j(                  rd?| d@n| dA}=|j9                  dB       |j9                  dC|=z          |j9                  dD       |j9                  dE| dF       |j9                  dG       |j9                  |8rdH|< dIndJ       |j9                  dK       |r$|j9                  dCdj7                  |       dF       |j9                  d'       ddd       | dL}>|j9                  d|> d+dj7                  |       dM       |j;                         5  |j9                  dN       |j9                  dO       |rI|j9                  dP       |D ]3  }:|r|j9                  |: dQ|: dR       |j9                  |: dS|: dT       5 |j9                  dU       |D ]E  }?|?j	                  d)      s|r|j9                  |? dQ|? dR       /|j9                  |? dS|? dT       G |j9                  dV       |D ]E  }?|?j	                  dW      s|r|j9                  |? dQ|? dR       /|j9                  |? dS|? dX       G |j9                  dY       |j9                  dZdj7                  |	D cg c]  }d[| d\
 c}      z   d-z          |j9                  d]dj7                  |	D cg c]  }d^| d_
 c}      z   d-z          i A|D ]
  }:|: d`A|:<    |D ]
  }?|? d`A|?<    |r3dj7                  Afda|D              }@|j9                  db|5 d0|@ d'       n|j9                  db|5 dc       |r`|j9                  dd       |D ]J  }|	|   }|r*|j9                  de| df       |j9                  | dg       4|j9                  | dh| di       L ddd       |je                         S c c}w c c}w c c}w c c}}
w c c}w c c}w c c}}w c c}}w # tJ        $ r Y <w xY w# tT        tV        f$ r d}0Y w xY w# 1 sw Y   4xY w# 1 sw Y   /xY wc c}w c c}w # 1 sw Y   |je                         S xY w)ja  
        Generate the complete Pallas kernel code as a Python string.

        This includes:
        - Import statements for JAX/Pallas
        - The kernel function that operates on refs
        - The main wrapper function that handles PyTorch<->JAX conversions via DLPack

        Args:
            name: Optional kernel name (will use placeholder if not provided)

        Returns:
            str: Complete Python source code for the Pallas kernel
        out_ptr)r  
in_out_ptrz2Pallas backend requires at least one output buffer<KERNEL_NAME>cpuzBPallas backend currently only supports using the first JAX device.zPALLAS_TARGET_TPU is set, but no TPU device was found. Please make sure that you have a TPU available and that JAX is configured correctly.r   r   z*
            import functools
            zimport math
            r>  zimport torch
            import jax
            import jax.numpy as jnp
            from jax.experimental import pallas as pl
            from torch._inductor.runtime.runtime_utils import torch_dtype_to_jax_runtime
            zC
            from jax.experimental.pallas import triton as pltritonzN
            from torch._inductor.runtime.runtime_utils import next_power_of_2TstripN_alias)r  in_ptrzdef z_kernel(rz   z, *, block_sizez):z# Mask for z_size = z.sizez = jnp.arange(block_size) < _sizez*# Define iteration variables as JAX arraysr   c              3  N   K   | ]  }t        |d       rt        |      n|  ywr  r  r  s     r)   r  z.PallasKernel.codegen_kernel.<locals>.<genexpr>  s'      7KL'!Y*?AQ F7r  r   r  c              3  2   K   | ]  }t        |        y wr1   ri  r  s     r)   r  z.PallasKernel.codegen_kernel.<locals>.<genexpr>6  s     -Qc!f-Q   z = jnp.arange(z
).reshape(r=   _jit_wrapperr  r
   r   c              3  2   K   | ]  }t        |        y wr1   ri  )r  r?   s     r)   r  z.PallasKernel.codegen_kernel.<locals>.<genexpr>M  s     ,LSV,Lr  z,)z()zB@functools.partial(jax.jit, static_argnums=(0, 1), donate_argnums=z(out_shapes, out_dtypes, zout_specs = tuple(z&    jax.ShapeDtypeStruct(shape, dtype)z3    for shape, dtype in zip(out_shapes, out_dtypes)z<# Calculate block_size as next power of 2 for Triton backendz0# Find maximum flattened size across all tensorszmax_size = 0zmax_size = max(max_size, z.size)zfor shape in out_shapes:zC    tensor_size = shape[0] if len(shape) == 1 else math.prod(shape)z)    max_size = max(max_size, tensor_size)z&block_size = next_power_of_2(max_size)Fc              3  0   K   | ]  \  }}| d |   yw)z: Nr7   )r  r@  os      r)   r  z.PallasKernel.codegen_kernel.<locals>.<genexpr>z  s     )P&1aQCr!+)Ps   zfunctools.partial(z _kernel, block_size=block_size),z_kernel,zreturn pl.pallas_call(z    z    out_shape=out_specs,z    interpret=,z    grid=(1,),z    input_output_aliases={ z },z    input_output_aliases={},z)(_mainz, stream=None):z/# Enable JAX x64 mode for float64/int64 supportz)jax.config.update('jax_enable_x64', True)z*# Convert Torch -> JAX for donated outputsz_jax = jax.device_put(z-.cpu().numpy(), device=jax.devices('tpu')[0])z_jax = jax.dlpack.from_dlpack(z
.detach())z+# Convert Torch -> JAX for in-place tensorsz!# Convert Torch -> JAX for inputsr  z.detach().contiguous())z-# Prepare output metadata from PyTorch tensorzout_shapes = (ztuple(rP  zout_dtypes = (ztorch_dtype_to_jax_runtime(r   _jaxc              3  (   K   | ]	  }|     y wr1   r7   )r  r  arg_name_maps     r)   r  z.PallasKernel.codegen_kernel.<locals>.<genexpr>  s      ++/L&+s   zres = z(out_shapes, out_dtypes)z9result_values = res if isinstance(res, tuple) else (res,)z'res_cpu = jax.device_get(result_values[z])z".copy_(torch.from_dlpack(res_cpu))z'.copy_(torch.from_dlpack(result_values[z])))3r   r.   python_argdefsr  r  RuntimeErrorr  rq  r   r  r   r  r  r  r   	_inductorr   _debug_cpu_to_tpu_pallas!pallas_take_first_jax_device_onlyr	   r  splicegetr  r   r4  aliasable_out_ptrsrN  rU  indentr  r  r2  r  r  r  r   r  r5  r   r  r  r   r!  r  r   _linesr  r  r  getvalue)Br(   r  codearg_defsr  r|   kernel_paramsrg  pure_out_paramsoutput_paramsouterinneroutput_buffer_lookupkernel_nameinterpret_is_cpuis_tpuinterpret_literalimportsaliasable_flagsparamr  r  alias_paramspointer_tailkernel_input_paramsfull_kernel_paramsflagnon_alias_out_setidxcopy_output_indiceskernel_signaturebuf_to_paramr  r  
param_namematching_paramfirst_output_shapefirst_output_numelfirst_out_paramfirst_out_buf_namer  r  r  var_symrm  rA  r5  
length_str
length_val	shape_strliner  
store_linejit_wrapper_namedonate_indicesdonate_literalalias_pairsout_idx
alias_name	input_idxalias_map_literal
kernel_arg	main_nameptralias_args_strr  sB                                                                    @r)   codegen_kernelzPallasKernel.codegen_kernel  s     !II446!Q)12A22&3Oq||I7N1OO$
5N(OA
 
 STT !%		 8 8 > > @ 
u%% 5L 
  
 -o77>>@EEN''@@??))KK"X  "#"k  '7FG
 /3.A.A*rK		 ( W" && b% 	, 	G4(+-$ 	NE.2259K't3 8R8R9M +;&:%MOE"	N +:
!&_U=SugV
 
 %
5M(NA
 
 +\9)M9&$3$9$9$;HjdD4TH
 "+=!9
CTEV=VC
 
 #2 ;-x		2D(E'FG$($7$7 RA 	
 	'([[] Z	/""t'8'8!$(II$;$;$A$A$C ULE53=eS3I%uL'U$(II$<$<$B$B$D ULE53=eS3I%uL'U +11B1B1H1H1J*K &Hh!-!1!1(H!EJ%)N* "%?h#a&.@-.N!	" &XJ'?@(8N;K5'QR'j(DXJeT, $$T-@-@KL%)"%)" &3A&6O)=)A)A/)R&)
!"#''"4"45G"HC#&<<>D16 7PT7 2. 23.%7 8 2a 7 28
 '+&;&;&A&A&C QNGU"7|H"\\F!%F!3J*4;FI4NS[TX
 ")  + 23a7&*<<$(II-Q>P-Q$Q	'jzl*YKWXY (>*Q'OP5Q: ++ *s4y)*
 (,'='= /#00NN:./qZ	/x *],7"#67 	/IC$)F%%cAg.	/  499,L^,L#LLtSN!N>a!	

 	#$$=diiH[>\=]]_`	
 [[] 8	 NN/0NNCDNNPQNN3 ""R QR~.0 NENN%>ugV#LMN 9:Y JKGH13K!*=!9 =??9-&**47(,vV_
$7$=$=j$I	#**Iw+?@ 3 9 9$ ?I&&	7';<= !%		)PK)P P
 && %[M1QR#}H- 
 NN34NN6J./NN56NN^,=+>a@ANN+,NN //@.AF3
 NN4 "dii0C&D%EQGHNN3q8	 t #m5)	9+Qtyy);<=_M	
 [[] V	NNLMNNFGKL". J
 )l*@Lyz )l*HT^_ NNHI# 	>>,/"e#9#>kl "e#A#jQ	 NN>?# 	>>(+"e#9#>kl "e#A#F]^	 NNJKNN ))NvdV73NOP
 NN )) %2  6dV7C 	 ,.L* ?
.8\+>Z(?# 1'*e4LS!1 #!% +3F+ " -..GGWWXY (8'99QRS"O / 
C(-DEcU"M $/Q'RS#f$KC5PST
YV	p }}q
 3O
 
l

 I
~  ) ! ! &z2 *%)
*}Z	/ Z	/Z8	  8	 T OeV	p }}s  ooo)o o:o;
ooo*o
o 
o 
7o&o&C7ppBp,Ao,;A	po<B/ppDp!8D<p!8Bp8Ap8)Ap8p.+p8 p3Cp8,	o95p8o99p<ppppp!p+.
p88qc                   t         j                  j                  }| j                  j	                         \  }}}}|D cg c]  }|j
                   }}|D 	cg c]  }	|	j                  d      s|	 }
}	t        t        t        |            }t        | di       }|
D 	cg c](  }	|j                  |	d      r||j                  |	         * }}	| ddj                  ||z          d}|j                  |       yc c}w c c}	w c c}	w )z7Generate the Python code that calls this Pallas kernel.r  r  Fz.run(rz   r=   N)r   r  wrapper_coder.   r  r  r  rs  mapr  getattrr  r  rN  rU  )r(   r  nodewrapperr  	call_argsr  r|   kernel_param_namesrg  r  call_arg_strs	aliasablealias_call_argskernel_calls                  r)   call_kernelzPallasKernel.call_kernel  s    ''&&$(II$<$<$>!)Q.67aff77&8TALL<S1TTSi01D"6;	 %
}}Q& ,22156
 
 eDIIo.M$N#OqQ+& 8T
s   C9C>.C>-D)
r   r  r  r  r  r   r  r   r  None)r  r  r  r  )r  r  r  r   )r  r  r  ztuple[str, bool])r  r   )r  r  r  r  )r  r  r  r  r  r   r1   )
r  r  r  r  rV  r   rW  r   r  r  )
r   r  r   r  rt  r   rV  +Union[CSEVariable, tuple[CSEVariable, ...]]r  r  )r  r  r  r   )r  r2   r  r  )r  r  r  zOptional[IRNode]r  r  )r3   r4   r5   r6   r;   	overridesr   r   __annotations__r*   r  r  r  r  r  r  r  r  r  r  r  r\  r  r  r  r  r  __classcell__)r  s   @r)   r  r    s    &I).E&.
:88&089=8FJ8	8>>T1l
.5$=~+
Bm` MQ4949 *493>49FI49	49lDD D &	D
 ;D 
5DL & &
kZ' 'r+   r  c                  :    e Zd ZeZedd       Z	 	 	 	 	 	 	 	 ddZy)PallasSchedulingc                6    t        t        j                  g      S r1   )r   r   REDUCE_TO_SINGLE_ELEMENT)clsr  s     r)   get_backend_featuresz%PallasScheduling.get_backend_features  s     >BBCDDr+   c                   t         j                  j                  }||j                  v r|j                  |   S t        j
                  j                  r$t        |t        j
                  j                        nd}t        j                  |j                  d            j                         d d }|dk(  rd| }nd| d| }||j                  |<   |j                  d|      }t               }|j                  d|d	       |j                  |d
       |j                  d       t!        ||      \  }	}
|	 d|
 }|j#                  ||j%                         |       |S )Nr>  zutf-8   fusedpallas_r  r  zasync_compile.pallas(z, r'''Tr  z''')
)r   r  r  src_to_kernelr   tritondescriptive_namesr   hashlibsha256encode	hexdigestr8  r   rU  r  r   define_kernelr  )r(   src_codenode_scheduler   r  
fused_namekernel_hashr  compile_wrapperoriginsdetailed_originsmetadata_comments               r)   r  zPallasScheduling.define_kernel  sZ    ''&&w,,,((22 }}.. "-1P1PQ 	
 nnX__W%=>HHJ2AN #K=1K#J<q>K*5h' ##O[A(*!!$9+"OPxt4!!&)$7w$O!!%Yb)9(:;k?+C+C+EGWXr+   N)r  ztorch.devicer  zOrderedSet[BackendFeature])r  r  r  zSequence[BaseSchedulerNode]r   r  r  r  )r3   r4   r5   r  kernel_typeclassmethodr  r  r7   r+   r)   r  r    sF    KE E
"" 3" 	"
 
"r+   r  )7
__future__r   r  r   typingr   r   r   r   r  r   torch.utils._ordered_setr   torch.utils._pallasr	   r>  r   runtime.runtime_utilsr   r   r   r   virtualizedr   block_analysisr   commonr   r   r   r   simdr   r   r   collections.abcr   r   irr   ops_handlerr   	schedulerr   MAIN_SUFFIX_logginggetArtifactLoggerr3   r&   r"   r  r9   r;   r  r  r7   r+   r)   <module>r'     s    "   6 6   / .  6 >  / L L 3 3 2+-  ..228]K> >2U, U_
,K _
,D^': ^'B!+~ +r+   