
    qi+                       d dl mZ d dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ d dlZd dlmZ d dlZd dlmZ d dlmZ d dlmZmZ d d	lmZ d
dlmZmZmZ d
dlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+m,Z, erd dlm-Z- d
dl.m/Z/m0Z0 d
dl1m2Z2m3Z3 ddl"m4Z4  ejj                  e6      Z7ejp                  dejr                  dejt                  dejv                  dejx                  dejz                  dej|                  dej~                  dej                  di	ZAd'dZB G d de      ZC G d  d!e'      ZDeDj                  d"       eDj                           G d# d$e+      ZG G d% d&e,      ZHy)(    )annotationsN)Path)AnyOptionalTYPE_CHECKING)
PRECEDENCE)_embed_headers)
OrderedSet)
CppPrinterExprPrinter)ValueRanges   )ceildivget_bounds_index_exprget_kernel_metadata)ops
OpsWrapperV   )CSEVariableDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferOpOverridesPythonPrinter)IterationRangesEntry
SIMDKernelSIMDScheduling)Union)ReductionType	StoreMode)	SchedulerSchedulerNode)OpVarTboolcharshortintlongucharfloathalfbfloatc                    t        | t              r:| t        j                  k(  ry| t        j                   k(  ry| | k7  ryt	        |       S t        | t
              r| rdS dS t	        |       S )N	HUGE_VALFz
-HUGE_VALFNANtruefalse)
isinstancer+   torchinfstrr%   )vals    a/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/mps.pyvalue_to_metalr9   8   s_    #u%))UYYJCZ3x	C	v)')s8O    c                      e Zd ZdZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZddZeZddZddZddZy)MetalExprPrinterz/Converts sympy expression to Metal code snippetc                    |j                   \  }}| j                  |      }| j                  |      }|j                  r	d| d| dS d| d| dS )Nc10::metal::floor_divide(, )metal::floor() / (argsdoprint
is_integer)selfexprxdivs       r8   _print_FloorDivz MetalExprPrinter._print_FloorDivI   s[    3LLOll3??.qcC5::qcse1--r:   c                    |j                   \  }}}| j                  |      }|dk7  r0| j                  |      }|j                  r
d| d| d}n	d| d| d}| j                  |      }d| d| dS )Nr   (rB   r@   rA   z) % (rC   )rG   rH   rI   rJ   mods        r8   _print_ModularIndexingz'MetalExprPrinter._print_ModularIndexingQ   s    ii3LLO!8,,s#Cs%uA&#A3eC52ll31#U3%q!!r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::min only supported for 2 argsstatic_cast<decltype(+)>(r@   zmetal::min(r?   lenrD   RuntimeErrormap_printrG   rH   ab
typecast_a
typecast_bs         r8   
_print_MinzMetalExprPrinter._print_Min]       tyy>QEFF4;;		*1,QCq3qc;
,QCq3qc;
ZL:,a88r:   c                    t        |j                        dk7  rt        d      t        | j                  |j                        \  }}d| d| d| d}d| d| d| d}d| d| dS )	Nr   z$metal::max only supported for 2 argsrQ   rR   rS   r@   zmetal::max(r?   rT   rY   s         r8   
_print_MaxzMetalExprPrinter._print_Maxe   r_   r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   metal::abs(r   r@   rU   rD   rX   rG   rH   s     r8   
_print_AbszMetalExprPrinter._print_Absm   s9    499~"""T[[167q99r:   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )Nr   zstatic_cast<long>(metal::rint(r   ))rd   re   s     r8   _print_RoundToIntz"MetalExprPrinter._print_RoundToIntq   s9    499~"""/DIIaL0I/J"MMr:   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  d	S )
Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .Mulz!static_cast<float>(metal::rint(1e * z) * 1er@   )rU   rD   rF   
ValueErrorparenthesizer   )rG   rH   numberndigits
number_strs        r8   _print_RoundDecimalz$MetalExprPrinter._print_RoundDecimalu   s    499~"""))Q;;abiajjkl  &&vz%/@A
27)3zl&RYQYPZZ[\\r:   c                n    |j                   \  }}d| j                  |       d| j                  |       dS )Nstatic_cast<float>(z) / static_cast<float>(r@   )rD   rX   )rG   rH   lhsrhss       r8   _print_IntTrueDivz"MetalExprPrinter._print_IntTrueDiv   s;    99S$T[[%5$66MdkkZ]N^M__`aar:   c                    t        |j                        dk(  sJ t        | j                  |j                        \  }}d| d| dS )Nr   zmetal::pow(static_cast<float>(z), static_cast<float>(rh   )rU   rD   rW   rE   )rG   rH   rI   ys       r8   _print_PowByNaturalz$MetalExprPrinter._print_PowByNatural   sF    499~"""4<<+1/s2H2NNr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   ru   r@   rU   rD   rE   rG   rH   rI   s      r8   _print_ToFloatzMetalExprPrinter._print_ToFloat   s=    499~"""LL1&$QCq))r:   c                X    |j                   rt        t        |            S t        |      S N)rF   r6   r(   re   s     r8   _print_FloatzMetalExprPrinter._print_Float   s#    ?? s4y>!t9r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   z1static_cast<int>(metal::floor(static_cast<float>(z)))r}   r~   s      r8   _print_FloorToIntz"MetalExprPrinter._print_FloorToInt   s=    499~"""LL1&B1#SIIr:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zstatic_cast<int>(metal::trunc(rh   r}   r~   s      r8   _print_TruncToIntz"MetalExprPrinter._print_TruncToInt   s=    499~"""LL1&/s"55r:   c                ~    t        |j                        dk(  sJ | j                  |j                  d         }d| dS )Nr   r   zmetal::log2(r@   r}   r~   s      r8   _print_OpaqueUnaryFn_log2z*MetalExprPrinter._print_OpaqueUnaryFn_log2   s=    499~"""LL1&aS""r:   c                J      fd|j                   D        \  }}}| d| d| S )Nc              3  V   K   | ]   }j                  |t        d    dz
         " yw)Atomg      ?N)ro   r   ).0argrG   s     r8   	<genexpr>z0MetalExprPrinter._print_Where.<locals>.<genexpr>   s-      
ADDc:f#5#;<
s   &) ?  : )rD   )rG   rH   cpqs   `    r8   _print_WherezMetalExprPrinter._print_Where   s5    
HL		
1a Cs#aS!!r:   N)rH   
sympy.Exprreturnr6   )__name__
__module____qualname____doc__rK   rO   r^   ra   rf   ri   rs   rx   r{   r   r   r   _print_floorr   r   r    r:   r8   r<   r<   F   s\    9.
"99:N
]b
O
*
J
 %L6
#
"r:   r<   c                  b   e Zd ZdZe	 	 d0	 	 	 	 	 	 	 	 	 d1d       Ze	 	 	 	 	 	 	 	 d2d       Zed3d       Zed4d       Zed5d       Z	ed6d       Z
ed7d	       Zed8d
       Zed8d       Zed8d       Zed8d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed9d       Zed:d       Zed9d       Zed9d       Zed9d       Z ed9d       Z!ed9d        Z"ed8d!       Z#ed9d"       Z$ed9d#       Z%ed8d$       Z&ed9d%       Z'ed8d&       Z(ed9d'       Z)ed;d(       Z*ed;d)       Z+e	 	 	 	 	 	 	 	 	 	 d<d*       Z,ed9d+       Z-ed8d,       Z.d=d-Z/d>d.Z0e1d?d/       Z2y)@MetalOverrideszXImplements Metal-specific overrides for ops. Base class emits Python-friendly overrides.Nc                ~    |t         j                  k(  rt        j                  d       d|  dS dt        |    d|  dS )Nz>float64 cast requested, probably from tensorify_python_scalarsru   r@   static_cast<>()r4   doublelogwarningDTYPE_TO_METAL)rI   dtype	src_dtypeuse_compute_typess       r8   to_dtypezMetalOverrides.to_dtype   sK     ELL KKP )1--nU34Bqc;;r:   c                6    dt         |    dt         |    d|  dS )Nzas_type<z>(static_cast<r   rh   r   )rI   r   r   s      r8   to_dtype_bitcastzMetalOverrides.to_dtype_bitcast   s/     ./0~i?X>YY[\][^^`aar:   c                    t        |       S r   r9   )r7   r   s     r8   constantzMetalOverrides.constant   s    c""r:   c                @   t         j                  j                  t         j                  j                  |             }t         j                  j                  j                  t         j                  j                  |t        |             }t        j                  ||      S )N)bounds)
r   kernelindex_to_strprepare_indexingcsegeneratecomputer   r   r   )rH   r   idx_strvars       r8   
index_exprzMetalOverrides.index_expr   sl    ((''(A(A$(GHhhll##HHg.CD.I $ 
 ||C''r:   c                    t         j                  j                  | |      5 } |       }d d d        j                  j                  rt        |      }t        j                  ||      S # 1 sw Y   AxY wr   )r   r   
mask_loadsr   is_boolr%   r   where)maskbodyothernew_maskresults        r8   maskedzMetalOverrides.masked   sa     XX  u- 	VF	 ==  KEyy6511	 	s   A))A2c                (    |  d| dt        |       S )Nr   r   r   )rZ   r[   r   s      r8   r   zMetalOverrides.where   s    Cs#nQ/011r:   c                    d|  d| dS )Nzc10::metal::remainder(r?   r@   r   rZ   r[   s     r8   	remainderzMetalOverrides.remainder   s    's"QCq11r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::max(r?   r   rZ   r[   r\   r]   s       r8   maximumzMetalOverrides.maximum   K    ,QCq3qc;
,QCq3qc;
!*R
|1==r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zc10::metal::min(r?   r   r   s       r8   minimumzMetalOverrides.minimum   r   r:   c                    |  d| S )Nz || r   r   s     r8   
logical_orzMetalOverrides.logical_or       D}r:   c                    |  d| S )Nz && r   r   s     r8   logical_andzMetalOverrides.logical_and   r   r:   c                    d|  dS )Nzmetal::isnan(r@   r   rI   s    r8   isnanzMetalOverrides.isnan       qc##r:   c                    d|  dS )Nzmetal::isinf(r@   r   r   s    r8   isinfzMetalOverrides.isinf   r   r:   c                    d|  dS )Nzmetal::log(r@   r   r   s    r8   r   zMetalOverrides.log      QCq!!r:   c                    d|  dS )Nzmetal::exp(r@   r   r   s    r8   expzMetalOverrides.exp  r   r:   c                    d|  dS )Nrc   r@   r   r   s    r8   abszMetalOverrides.abs  r   r:   c                    d|  dS )Nzmetal::signbit(r@   r   r   s    r8   signbitzMetalOverrides.signbit  s     1%%r:   c                    d|  dS )Nzmetal::precise::sin(r@   r   r   s    r8   sinzMetalOverrides.sin      %aS**r:   c                    d|  dS )Nzc10::metal::sinc(r@   r   r   s    r8   sinczMetalOverrides.sinc  s    "1#Q''r:   c                    d|  dS )Nzmetal::precise::cos(r@   r   r   s    r8   coszMetalOverrides.cos  r   r:   c                    d|  dS )Nzmetal::tan(r@   r   r   s    r8   tanzMetalOverrides.tan  r   r:   c                    d|  dS )Nzmetal::asin(r@   r   r   s    r8   asinzMetalOverrides.asin#      aS""r:   c                    d|  dS )Nzmetal::acos(r@   r   r   s    r8   acoszMetalOverrides.acos'  r   r:   c                    d|  dS )Nzmetal::atan(r@   r   r   s    r8   atanzMetalOverrides.atan+  r   r:   c                    d|  d| dS )Nz::metal::atan2(r?   r@   r   )rI   rz   s     r8   atan2zMetalOverrides.atan2/  s     2aS**r:   c                    d|  dS )Nzmetal::sqrt(r@   r   r   s    r8   sqrtzMetalOverrides.sqrt3  r   r:   c                    d|  d|  dS )NrQ   z)>(-r@   r   r   s    r8   negzMetalOverrides.neg7  s     'qcaS22r:   c                    d|  dS )Nzmetal::rsqrt(r@   r   r   s    r8   rsqrtzMetalOverrides.rsqrt=  r   r:   c                    d|  dS )Nzmetal::tanh(r@   r   r   s    r8   tanhzMetalOverrides.tanhA  r   r:   c                    d|  dS )Nzmetal::atanh(r@   r   r   s    r8   atanhzMetalOverrides.atanhE  r   r:   c                    d|  d| dS )Nr>   r?   r@   r   r   s     r8   floordivzMetalOverrides.floordivI  s     +1#Rs!44r:   c                    d|  dS )NrA   r@   r   r   s    r8   floorzMetalOverrides.floorN  r   r:   c                    d|  dS )Nzmetal::sign(r@   r   r   s    r8   signzMetalOverrides.signR  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::fmod(r?   r   r   s       r8   fmodzMetalOverrides.fmodV  sK    ,QCq3qc;
,QCq3qc;
j\J<q99r:   c                    d|  dS )Nmetal::trunc(r@   r   r   s    r8   trunczMetalOverrides.trunc\  r   r:   c                    |  d| }| j                   | j                   j                  s"|j                   |j                   j                  rd| dS |S )Nz / r  r@   )r   is_floating_point)rZ   r[   quots      r8   truncdivzMetalOverrides.truncdiv`  sQ    Cs|GGAGG$=$=GGAGG$=$="4&**r:   c                    d|  dS )Nzmetal::ceil(r@   r   r   s    r8   ceilzMetalOverrides.ceili  r   r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nrandomzc10::metal::rand(r?   r@   r   r   headersaddseedoffsets     r8   randzMetalOverrides.randm  s/    	X&"4&6(!44r:   c                f    t         j                  j                  j                  d       d|  d| dS )Nr  zc10::metal::randn(r?   r@   r  r  s     r8   randnzMetalOverrides.randnr  s/    	X&#D6F8155r:   c           	     r    t         j                  j                  j                  d       d|  d| d| d| d	S )Nr  zc10::metal::randint64(r?   r@   r  )r  r  lowhighs       r8   	randint64zMetalOverrides.randint64w  s=     	
X&'vRxr#baHHr:   c                    d|  dS )Nzmetal::rint(r@   r   r   s    r8   roundzMetalOverrides.round~  r   r:   c                D    d|  d| d|  d}d|  d| d| d}d| d| dS )NrQ   rR   rS   r@   zmetal::pow(r?   r   )rZ   r[   cast_acast_bs       r8   powzMetalOverrides.pow  sK    (1QCs1#Q7(1QCs1#Q7VHBvha00r:   c                f    t         j                  j                  j                  d       d| d| dS )Nspecial_mathc10::metal::rM   r@   r  )rG   rZ   names      r8   _special_unaryzMetalOverrides._special_unary  s/    	^,dV1QCq))r:   c                l    t         j                  j                  j                  d       d| d| d| dS )Nr%  r&  rM   r?   r@   r  )rG   rZ   r[   r'  s       r8   _special_binaryzMetalOverrides._special_binary  s5    	^,dV1QCr!A..r:   c           
        dD ].  }t        | |t        j                  | j                  |             0 t        j                  | j                  d      | _        dD ]1  }t        | |t        j                  | j                  |dz                3 dD ].  }t        | |t        j                  | j
                  |             0 dD ]1  }t        | |t        j                  | j
                  |dz                3 y )N)erferfinvi0i0ei1i1edigammaspherical_bessel_j0)r'  	log_gamma)
	bessel_j0	bessel_j1	bessel_y0	bessel_y1modified_bessel_i0modified_bessel_i1modified_bessel_k0modified_bessel_k1scaled_modified_bessel_k0scaled_modified_bessel_k1_forward)	polygammaigammaigammaczeta)
chebyshev_polynomial_tchebyshev_polynomial_uchebyshev_polynomial_vchebyshev_polynomial_whermite_polynomial_hhermite_polynomial_heshifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_w)setattr	functoolspartialmethodr(  lgammar*  )clsr'  s     r8   _initialize_special_opsz&MetalOverrides._initialize_special_ops  s    	
 
	WD Cy66s7I7IPTUV
	W ,,S-?-?kR

 	D ''(:(:
ARS	&
 	XD Cy66s7J7JQUVW	X
 	D ''(;(;$BST	r:   NT)
rI   r   r   torch.dtyper   zOptional[torch.dtype]r   r%   r   r6   )rI   r   r   rU  r   rU  r   r6   )r7   zUnion[bool, float, int]r   rU  r   r6   )rH   r   r   rU  r   r6   )r   r   r   r   r   r   r   r6   )rZ   r$   r[   r$   r   r$   r   r6   )rZ   r$   r[   r$   r   r6   )rZ   r   r[   r   r   r6   )rI   r   r   r6   )rI   r   rz   r   r   r6   )r  r   r  r   r   r6   )
r  r   r  r   r  r   r  r   r   r6   )rZ   r   r'  r6   r   r6   )rZ   r   r[   r   r'  r6   r   r6   r   None)3r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r  r  r#  r(  r*  classmethodrS  r   r:   r8   r   r      s   b ,0"&	<<< )<  	<
 
< < bb*b7Bb	b b
 # # ( ( 2 2 2 2 2 2 > >
 > >
     $ $ $ $ " " " " " " & & + + ( ( + + " " # # # # # # + + # # 3 3
 $ $ # # $ $ 5 5 $ $ # # : :
 $ $   # # 5 5 6 6 II#.I5@IHSI	I I # # 1 1
*/ = =r:   r   mpsc                      e Zd ZU dZeZdZdZdZdZ	 e
       j                  Z e       j                  Z e       j                  ZeZ edg      Zded<   g Zd	ed
<   	 	 	 	 	 	 d fdZddZddZ	 d	 	 	 	 	 	 	 	 	 ddZddZddd ej8                         f	 	 	 	 	 	 	 	 	 	 	 d dZ	 	 	 	 	 	 	 	 	 	 d!dZ	 	 	 	 	 	 	 	 	 	 d!dZd"dZ d#dZ!dd$dZ"	 d%	 	 	 	 	 	 	 d&dZ#	 	 	 	 	 	 	 	 	 	 d'dZ$ xZ%S )(MetalKernelz;Implement Metal codegen based on the SIMDKernel abstraction;auto i       utilszOrderedSet[str]r  zlist[IterationRangesEntry]multistage_reduction_entryc                X    t        |   |fi | t        j                         | _        y r   )super__init__	itertoolscountacc_var_ids)rG   tilingkwargs	__class__s      r8   rd  zMetalKernel.__init__  s&    
 	*6*$??,r:   c                    t         |   S r   r   )rG   r   s     r8   dtype_to_strzMetalKernel.dtype_to_str  s    e$$r:   c                   | j                   j                  |      }| j                  |      }t        j                  j                  |      }| d| j                  |       d}|t        j                  t        j                  fv rd| d}t        j                  }| j                  j                  | j                  ||      S )z"Codegen a load from an InputBuffer[]ru   r@   r   )rD   inputr   r   graph	get_dtyper   r4   float16bfloat16float32r   r   loads)rG   r'  indexr   r   lines         r8   loadzMetalKernel.load  s    iiood#%%e,!!$'a))%013U]]ENN33 )a0DMMExx  T ??r:   Nc                h   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }d| d| d}|| d| j                  |       d| d}n[|dk(  rH| j                  j                  d       d	| d
}	d|	 d| d}
|	 d|
 d| j                  |       d| d}nt        d|       | j                  r&| j                  j                  t        ||             y | j                  j                  t        ||             y )Nr   r   r@   rn  ] = r]  
atomic_addatomiczc10::metal::AtomicType<>zreinterpret_cast<device z
::type *>(z::atomic_add(r?   );zUnimplemented store mode )rD   outputr   rl  r   rr  rs  r   r  r  rV   inside_reductionr   	writeliner   stores)rG   r'  rx  valuemoder   	dtype_strcast_valry  atomic_typecast_vars              r8   storezMetalKernel.store  s<    iit$%%e,%%agg&7&7&=>	!)BugQ7<U!D--e45T(1ED\!LLX&3I;a@K1+jQOH!]-zD<M<Me<T;UUWX`WaacdD!:4&ABB  LL""<d#;<KK!!,tT":;r:   c                   | j                   j                  |      }| j                  |      }| j                  t        j
                  j                  |            }t        d | j                  D              }| d| j                  |       d| d| d}d|j                   d| }| j                  j                  t        ||             y )Nc              3  :   K   | ]  }|j                   s|  y wr   is_reductionr   ts     r8   r   z.MetalKernel.store_reduction.<locals>.<genexpr>  s     K1ANNQKs   rn  z] = static_cast<r   r  if (z == 0) )rD   r  r   rl  r   rr  rs  nextrange_treesr   r'  r  r  r   )rG   r'  rx  r  r   r  reduction_dimry  s           r8   store_reductionzMetalKernel.store_reduction  s    iit$%%e,%%agg&7&7&=>	K(8(8KKa))%011A)BugUWXm(()7l467r:   Tc                   t        |t        j                        r| j                  |      }dt	        | j
                         }t        j                  j                  |||      }|rdnd}|| d| z  }|r|d| j                  |       dz  }||rJ d       |d| z  }| j                  j                  || j                  z          |S )	Ntmp_acc_zthreadgroup   rn  ro  z+Thread group var can not have default value = )r3   r4   r   rl  r  rg  r   r   create_cse_varsexprindexing_coder  suffix)	rG   r   
elem_countdefault_valueis_threadgroupr   var_namer   var_defs	            r8   _new_idxvarzMetalKernel._new_idxvar  s     eU[[)%%e,Ed4#3#3456hh%%h>$2.eWAhZ((4::j12!44G$%T'TT%]O,,G$$Wt{{%:;
r:   c                    |||f}|| j                   j                  v r| j                   j                  |   S | j                  ||||      }|| j                   j                  |<   |S )z)Caching wrapper around _reduction_nocache)r   reduction_cache_reduction_nocache)rG   r   r   reduction_typer  	cache_keyr   s          r8   	reductionzMetalKernel.reduction3  sf     6	00088++I66((	>5Q.4  +r:   c                P   | j                   sJ | j                  rJ dAd}d}d}| j                  D ]  }|j                  s|r|dz  }||j                   d| z  }t        |j                  t        j                        r||j                  z  }_|t        j                  |j                   ddd      z  } t        j                  || j                        }| j                  |      }	t        |t        j                        rt        || j                        n| j                  }
|d	k(  r| j!                  |      }| j"                  j%                  | d
       | j"                  j%                  d       | j&                  j)                  d| d| d       | j*                  j%                  d       |S | j,                  j/                  d       |dv rt0        |   }| j!                  ||
      }| j2                  s|}nD|dk(  rdnd\  }}| j!                  ||d      }| j&                  j)                  | d| d| d       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |dv r| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}nY|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j&                  j)                  | d&| d| d| d'       | j4                  j7                  | j*                  d| d| d| d| d|	 dt0        |         S |d(v r]| j!                  ||
      }| j!                  ||
      }t8        |   }d| d | d}| j2                  s|}dt8        |    d | d}n|j;                  d!      rd"nd!}d#| d$| d%}| j!                  ||d      }| j!                  |d)d      }t=        d* | j>                  jA                         D              }|d+k(  rd,nd-}|jB                  rd.| d/nd}| j&                  j)                  d| d| d| | d| d0| d1| d0|j                   d2       | j4                  j7                  | j*                  d| d| d| d| d| d| d|	 d|      S |d3k(  r+| j2                  s~| j!                  ||      }| j&                  j)                  | d4| d5| d       | j4                  j7                  | j&                  d| d| d|	 dtD        jF                        } ||      S | j!                  d6|      }| d4| d7}| j"                  j)                  | d8       | j&                  j%                  | d9| d:| d;       | j4                  j7                  | j*                  d<| d| dtD        jF                        } ||      S |d=k(  r4t        |tH              sJ d>       | j!                  d6|      }| d4| d7}d?|d)    d|d    d|d@    d}| j"                  j)                  | d8       | j2                  rC| j"                  j)                  | d8       | j&                  j%                  | d9| d| d'       n!| j&                  j%                  | d0| d       | j4                  j7                  | j2                  r| j*                  n| j&                  d| d| d|	 dtD        jF                        } ||      S tK        |      )BzeCodegen a reduction operation.
        Only sum and prod operations are somewhat reasonable optimizedc           
         t        j                  dD cg c](  }t        |  d| | j                  | j                        * c}      S c c}w )Nxyzrk   )r   _unwrapr   r   r   )res3r  s     r8   _unwrap_helperz6MetalKernel._reduction_nocache.<locals>._unwrap_helperO  sA    %%NSTvQqc]DKKDT Ts   -Ar  r    + rm   numelTintegerpositiveanyz	 = false;z7threadgroup_barrier(metal::mem_flags::mem_threadgroup);z
                if (z) {
                    z' = true;
                }
            reduction_utils)prodsumr  )r   rR   )r   *F)r  r  r  z= r]  zc10::metal::threadgroup_rM   r?   r@   rp  )maxminr   r   r  lowestz::metal::numeric_limits<z>::z()z = ::c10::metal::r  )argminargmaxr   c              3  8   K   | ]  }|j                   r|  y wr   r  r  s     r8   r   z1MetalKernel._reduction_nocache.<locals>.<genexpr>  s      ~~ s   r  r  <z || ::metal::isnan(z) r  z;
                    z$;
                }
                welford_reducern  r|  float3ro  z = 0.0;z! = ::c10::metal::welford_combine(z	, float3(z, 0.0, 1.0));z(c10::metal::threadgroup_welford_combine(welford_combinez&Input to welford combine must be tuplezfloat3(r   )r  r   r   ztuple[CSEVariable, ...])&r  
_load_maskr  r  r'  r3   r  sympyIntegerSymbolprefixMinmax_threadgroup_sizer  r   simd_group_sizer  r  r  r   splicer  r  r  r   ra  r   r   r   endswithr  range_tree_nodesvaluesr
  r4   rv  tupleNotImplementedError)rG   r   r   r  r  r  reduction_idxacc_buf_sizerdacc_buf_size_strshmem_buf_sizeacc	acc_dtypeacc_bufr7   default_valreduction_opsrc_metal_type
cast_valuelim_fn	limit_valdata_acc_bufidx_acc_bufidx_validx_varcmp_op
nan_suffixwf_resacc_thread_var	inp_values                                 r8   r  zMetalKernel._reduction_nocacheC  s^    $$$$??""	 "" 	B??&yL>::M"((EMM2(yyk'! 	 yyt/H/HI::l3 ,6 L$"6"67%% 	 U"""5)C((C5	):;((I LLG E  KK!!I J*+_,29=I&&y.AG22 !/% 7HX *\ &&[ '  ##se1\N"UG1$EF88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 ^+&&y.AG+I6N''7r%BJ22 %3%<%<U%C6~6Fc&QST	&&Yu '  ##e,^,<AcU"ZLPRS 88$$*>*:!G9Bse2m_\^_o^ppqr07 %  
 11++I~FL**5.AK+I6N''7r%BJ22 ()>(?r-PQR%3%<%<U%C6~6Fc&QST	&&Yu '  **5RW*X !2299;  !/( : !22 *%3 
 ## )G1VHAcU:, 7EUG $IS /%  88$$*>*:!L>K=XZ%r'"]O26F5GqJ	 %   --22**9lC##wiqtE7!$LM**LL.~.>ayK[J\\]^-- + 
 &f--&&x>G 'y-:N%%(8&@ALL""!""CNCSS\]b\ccpq XX&&:7)2l^STUmm ' F
 "&))..eU+U-UU+&&x>G 'y-:N!%(2eAhZr%(1EI%%(8&@A..""))^,<G*DE&&%&&GGWWYZcYddfg &&.)9YKq'IJXX&&#>>DLL*>*:!G9BGWFXXYZmm ' F
 "&))!.11r:   c                ,   | j                  |j                        }| j                  |      }|j                  rQt	        |j
                  j                  t        j                        r\|j
                  j                  | j                  k  r9| j                  j                  | j                   d|j                   d| d       y t	        |j
                  j                  t        j                        r|j
                  j                  n.t        j                  |j
                  j                   ddd      }| j                   j#                  |       |t%        | j                  dz
        z   t%        | j                        z  }| j                  |      }| j&                  j                  d|j                   d	|j                   d
| d|j                   d	       | j&                  j)                         5  t	        |t        j                        rS| j&                  j                  | j                   d|j                   d| j                   d|j                   d| d
       nH| j&                  j                  | j                   d|j                   d| d| d|j                   d
       t	        |t        j                        s|| j                  z  |k7  r,| j&                  j                  d|j                   d| d       d d d        y # 1 sw Y   y xY w)Nr  r  r]  r  Tr  r   z	for(auto z
_cnt = 0; z_cnt < z; ++z_cnt) {rm   z_cnt + r  z_cnt;r  z >= z) break;)rename_indexingrH   r  r  r3   rootr  r  r  r  r  r  index_dtyper'  r  r  ra  appendr+   r   indent)rG   entryr   	index_stracc_size	loop_sizeloop_size_strs          r8   codegen_iteration_ranges_entryz*MetalKernel.codegen_iteration_ranges_entry  s   ))%**5
JJz*	 !!uzz''7

  D$=$==((##$Aejj\YKqA  %****EMM: JJ!2!2 3594RVW 	 	''..u5 d&?&?!&C DD%%J
 
	 

9-		

|:ejj\tTYT^T^S__gh	
 YY 	O(ELL1		##''(%**S9R9R8SSVW\WaWaVbbijsittuv 		##''(%**Ss9+UXY^YcYcXddij 8U\\2t888HD		##d5::,d8*H$MN	O 	O 	Os   3DL

Lc                   | j                   r-| j                  j                         5  | j                  j                  | j                         | j                  j                  | j
                         ddd       | j                  j                  dt        | j                         z         | j                  j                  t        d | j                  j                  j                         D                     | j                   r| j                   j                         j                          | j                   r5nJ| j                  j                  | j                         | j                  j                  | j
                         | j                  j                  | j                         | j                  j!                          | j
                  j!                          | j                  j!                          y# 1 sw Y   xY w)a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        N}c              3  T   K   | ]   }t        |t              r|n|fD ]  }|  " y wr   )r3   r  )r   itemvs      r8   r   z+MetalKernel.codegen_body.<locals>.<genexpr>C  s:      &0u&=dD7  s   &()ra  r   r  r  rw  r   r  rU   r   
invalidater
   r  r  popcache_clearr  clear)rG   s    r8   codegen_bodyzMetalKernel.codegen_body/  sc    **!!# /		  ,		  ./ IIc$*I*I&J JK
 HH  $ 8 8 ? ? A  11//335AAC 11 IITZZ(IIT\\*		%

1/ /s   AG99Hc                *   | j                          t               }t        j                  j                  r|j                  d       n|j                  d       | j                         }|j                         5  t        j                  j                  s'| j                  D ]  }|j                  d| d        nr| j                  D cg c]  }d| d
 }}t        |t        t              j                  j                  j                  dz  gt                     }|j                  |       | j                  rwt        j                   d | j"                  D              }t%        |t&        j(                        rt+        || j,                        n| j,                  }|j                  d| d       |j                  d	       |j                         5  | j.                  j0                  j3                         D ]Z  \  }	}
|	| j4                  v r| j7                  t        j                  j9                  |	            }|j                  d
| d|
 d       \ | j.                  j:                  j3                         D ]  \  }	}
t        j                  j9                  |	      }|t<        j>                  k(  rBt        j                  jA                  |	      }||jC                         g k7  rtE        d      d}n| j7                  |      }|j                  d| d|
 d        | j.                  jF                  jI                         D ]  }
|j                  d|
 d        |D ]F  }t%        |jJ                  t&        j(                        r(|j                  d|jL                   d       H tO        |      dk  sJ d       tO        |      dkD  rdtO        |       nd}tO        |      dk(  r|d   jP                  nd}| j                  rdnd}|j                  | d| d|        | j                  r|j                  | d       ddd       |j                  d       |j                         5  tO        |      dkD  rAtS        |      D ]3  \  }}|j                  d|jP                   dtU        d |z          d!       5 |jW                  | jX                         |jW                  | jZ                         ddd       |j                  d"       ddd       t        j                  j                  r!|j                  d#       |j]                         S |j                  d$       |j]                         S c c}w # 1 sw Y   3xY w# 1 sw Y   xY w# 1 sw Y   xY w)%z3Called at the end to generate a final kernel stringz(R"MTL(zcompile_mps_shader('''z#include <c10/metal/z.h>includec              3  L   K   | ]  }|j                   r|j                    y wr   )r  r  r  s     r8   r   z-MetalKernel.codegen_kernel.<locals>.<genexpr>o  s#      1~~ GG1s   "$z$[[max_total_threads_per_threadgroup(z)]]zkernel void generated_kernel(zdevice z* ,Nzfloat64 is not supported by MPSr+   z	constant zconstant long& znumel,   z%Up to 3 index variables are supportedr   uintr   
thread_posr  r  z [[thread_position_in_grid]]z- group_pos [[thread_position_in_threadgroup]]z) {r^  z = thread_pos.x   r]  r  z)MTL");z'''))/r   r   r   rr  cpp_wrapperr  active_range_treesr  r  r	   r   __file__parentr
   r  mathr  r  r3   r  r  r  r  rD   output_buffersitemsremoved_buffersrl  rs  input_buffersr4   float64try_get_bufferget_sizerV   sizevarsr  r  r  rU   r'  	enumeratechrr  r  r   getvalue)rG   r'  codeidx_varsheaderr  header_contentstotal_reduction_sizethreadgroup_sizeouterinnerr  r   	outer_bufr  thread_pos_dtypethread_pos_var_namethread_pos_suffixidxr   s                       r8   codegen_kernelzMetalKernel.codegen_kernelT  s	   77NN9%NN34**,[[] T	 77&&"ll GFNN%9&#EFG FJ\\;A*6(#6  #1(^**11889DEL#
 /$$'+yy 1!--1 ($ ""6F ,d.G.GH22 !
 :;K:LCP NN:; )$(II$<$<$B$B$D DLE5 4 44  $ 1 1!''2C2CE2J KINNWYKr%#BC	D
 %)II$;$;$A$A$C 
FLE5GG--e4E-$%GG$:$:5$A	$,	0B0B0D0J"./P"QQ$+	$($5$5e$<	NNYykE7!#DE
F "YY//668 ?ENN_UG1#=>?  ( QG!'--?8H'OP	Q 8}q(Q*QQ(.1(ma.?d3x=/*V ! ),H(:HQK$$ $ ,0+@+@Cb!'(*=)>>Z[lZmn ((NN+,,YZO)T NN5! 'x=1$$-h$7 S#CHH:^Cc	N;K1M D../DII&' NN3iT	 l 77NN9% }} NN6"}}m8) )V' 'YT	  T	 sR   8AV	U+C;V	I/U0?)V	(BU=.V	+V	0U:	5V	=V	V		Vc           	     l   t         j                  j                  }| j                  j                  D ]  }|j                  |        | j                  j                         \  }}}}t        ||      D 	
ci c]  \  }	}
t        |	      |
 }}	}
g | j                  j                  j                         | j                  j                  j                         }|D cg c]  }|| j                  vs| }}|| j                  j                  D cg c]  }t        |       c}z  }|D cg c]  }||   	 }}| j                  D ]  }t        |j                  t         j"                  t$        f      r.t        |j                  t         j&                        r|j                  }n4t         j                  j                  j)                  ||      j*                  }|j,                  r| j.                  s|j1                  t        |             |j1                  t$                t         j                  j2                  r| j4                  n| j6                  }dd}t9        | j;                               dkD  r| j;                         D cg c]J  } ||j,                  r*t!        j<                  |j                  | j>                        n|j                        L }}|j1                   ||d             |j1                  t@               n%t         j                  j2                  rtC        d      | j.                  r| j;                         D cg c]@  }|j,                  r0 |t!        j<                  |j                  | j>                              ndB }}|j1                   ||d             |j1                  t@               n1t         j                  j2                  r|dgz  }|j1                  d       |jE                  ||tG        jH                  d      d	|
       yc c}
}	w c c}w c c}w c c}w c c}w c c}w )z0
        Codegens a call to this kernel
        threadsc                    t         j                  j                  r(| D cg c]  }d| d
 } }ddj                  |        dS | ddj                  |        dS c c}w )Nzstatic_cast<uint64_t>(r@   {r?   r  z=[ro  )r   rr  r	  join)r(  kwargr  s      r8   format_threadsz/MetalKernel.call_kernel.<locals>.format_threads  sh    ww""BIJQ3A3a8JJDIIg./r22499W#5"6a88 Ks   Ar   zWe should always have threads?1
group_sizeNrZ  F)devicetriton	arg_types)r(  z	list[str]r,  r6   r   r6   )%r   rr  wrapper_coderD   r  ensure_size_computedpython_argdefszipr6   r  keysr  r  r  r3   r  r  r  r(   r  generate_numel_exprr   r  r  r  r	  cexprpexprrU   r
  r  r  listrV   generate_kernel_callr4   r0  )rG   r'  nodedeallocate_wswrapperr  _	call_argsr2  call_argarg_typearg_name_to_typerD   r   treerH   expr_printerr-  r(  s                      r8   call_kernelzMetalKernel.call_kernel  s|    ''&&## 	,A((+	, &*YY%=%=%?"9a>A)Y>W
(:(CM8#
 
 S))..0R4993J3J3O3O3QR#Gs$2F2F'FGG!3!34AQ446:;s%c*;	; $$ 	&D$**u}}c&:;DJJ5zzww++??dKQQ $$(=(=CI&  %	& &'WW%8%8tzzdjj	9 t&&()A- 002   ~~ IIaggt'@'@A 	G  KKw	:;T"ww"""#CDD   002
  >> UYYqww0I0IJK G  KKw=>T"ww""   &$$<<& 	% 	
K

 H4;:"s,   5PP0PP"*P'AP,AP1c                    |s|sy | j                  |      }|r| dnd}|r| d| j                  |       nd}|r|r
d| d| d}nd| | d}| j                  j                  | j                  |d	
       y )Nz < 0r  z > zif ((z) && (z	)) returnr  z) returnF)
assignment)r   r   r   r   )	rG   rH   sizelowerupperexpr_str
lower_expr
upper_exprry  s	            r8   check_boundszMetalKernel.check_bounds  s      $$T**/z&R
BGzT%6%6t%<$=>R
U:,fZL	BD*j\:D$,,?r:   )rh  zdict[str, sympy.Expr]ri  r   r   rW  )r   rU  r   r6   )r'  r6   rx  r   r   r   r   )
r'  r6   rx  r   r  r   r  r!   r   rW  )r'  r6   rx  r   r  r   r   rW  )r   zUnion[str | torch.dtype]r  zOptional[int]r  zOptional[Any]r  r%   r   zValueRanges[Any]r   r   )
r   rU  r   rU  r  r    r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   rQ  )r  r   r   rW  rV  )r'  zOptional[str]r   r6   rT  )r'  r6   r=  r   r>  r%   r   rW  )
rH   r   rJ  r   rK  r%   rL  r%   r   rW  )&r   r   r   r   r   	overridesr  newvar_prefixr  r  r   rE   r:  r   r9  r<   r  kexprr
   r  __annotations__ra  rd  rl  rz  r  r  r   unknownr  r  r  r  r   r&  rG  rP  __classcell__rj  s   @r8   r\  r\    s   EIFMOO##EL  E&&EE)7)4G_4=? :?-%- - 
	-%@ SW<< *<3><FO<	<*	8 %)'+##6;#6#6#8' " %	
  ! 
,  &	
 ; 
5 w2w2 w2 &	w2
 ;w2 
5w2r1Of#JfR BFW
W
"W
:>W
	W
r@@&0@9=@FJ@	@r:   r\  c                  <     e Zd ZeZd fdZ	 	 	 	 	 	 	 	 ddZ xZS )MetalSchedulingc                    t         |   |       t        j                  j                  }|7t        j                  j
                  s|j                  j                  d       y y y )NzDfrom torch._inductor.runtime.runtime_utils import compile_mps_shader)rc  rd  r   rr  r3  r	  r  r  )rG   	schedulerr?  rj  s      r8   rd  zMetalScheduling.__init__*  sQ    #''&&77&&%%Z ' r:   c                ^   t         j                  j                  }||j                  v r|j                  |   }|S d|j	                          }| }||j                  |<   t         j                  j
                  r	d| d|z   }t        ||      \  }}| d| }	|j                  |||	d       |S )Nmps_lib_zconst char* z
_source = 
F)gpu)r   rr  r3  src_to_kernelnext_kernel_suffixr	  r   define_kernel)
rG   src_codenode_scheduler   r?  kernel_namemps_lib_nameoriginsdetailed_originsmetadata_comments
             r8   rc  zMetalScheduling.define_kernel3  s     ''&&w,,,!//9K"  &g&@&@&B%CDL)NK.9G!!(+ww""),zBXM(;M7(S%G%")"-=,>?!!,:JPU!Vr:   )r\  zOptional[Scheduler]r   rW  )rd  r6   re  zlist[SchedulerNode]r   r\  r   r6   )r   r   r   r\  kernel_typerd  rc  rW  rX  s   @r8   rZ  rZ  '  s2    K,?IT	r:   rZ  )r7   z)Union[float, int, bool, str, CSEVariable]r   r6   )I
__future__r   rO  re  loggingr  pathlibr   typingr   r   r   r  sympy.printing.precedencer   r4   torch.utils._cpp_embed_headersr	   torch.utils._ordered_setr
   torch.utils._sympy.printersr   r   ExprPrinter_torch.utils._sympy.value_rangesr   r`  r   r   r   virtualizedr   r   r   commonr   r   r   r   r   r   simdr   r   r   r   ops_handlerr    r!   r\  r"   r#   r$   	getLoggerr   r   r%   int8int16int32int64uint8r+   r,   ru  r   r9   r<   r   _initialize_pointwise_overridesrS  r\  rZ  r   r:   r8   <module>r     s,   #      / /  0  9 / O 7 G G , ,  C B 64g! 
JJ	JJ	KK	KK	KK	KK	KK	JJ	NNH
h"| h"V][ ]@	  . .u 5  & & (O	@* O	@d"n "r:   