
    qi                      U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlmZmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZmZ d dl Z d dl!Z!d dl"Z!d dl#m$Z$ d d	l%m&Z' d d
l(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z0 d dl1m2Z2m3Z3m4Z4 d dl5m6Z6m7Z7 ddl8m9Z9m:Z: ddl;m<Z< ddl=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZSmTZTmUZUmVZV erkd dlWmXZXmYZYmZZZm[Z[ d dl"m\Z\ ddl]m^Z^ ddl_m`Z`maZambZbmcZc ddldmeZe ddlfmgZgmhZhmiZi ddl@mjZj ddlkmlZl  ed       ZmeXeeh   gegf   Zneoel   Zpeeqe j                  f   ZseqZte!j                  j                  ewd!      Zx ej                  ew      Zzdd"Z{ej                   G d# d$             Z} G d% d&ej$                        Z~ G d' d(e      Z eHd)*       G d+ d,e             Z G d- d.      Zej                   G d/ d0             Zej                   G d1 d2             Zej                   G d3 d4             Zej                   G d5 d6             Zej                   G d7 d8             Zeeeeeef   Zi Zd9ed:<    G d; d<      Zi Zd=ed><   i Zd?ed@<   i ZdAedB<   	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddCZ G dD dEe      Z	 	 	 	 ddFZ	 	 	 	 	 	 ddGZddHZ	 d	 	 	 	 	 	 	 ddIZddJZddKZej,                  ddL       Z	 	 	 	 	 	 	 	 ddMZ	 	 	 	 	 	 ddNZddOZe!j6                  e!j8                  e!j:                  e!j8                  ie!j<                  e!j>                  e!j@                  e!jB                  e!jD                  e!jF                  e!jH                  e!jJ                  e!jL                  e!jN                  e!jP                  fD  ci c]  } | |  c} ZdPedQ<   	 	 	 	 	 	 	 	 ddRZ	 	 	 	 	 	 	 	 ddTZ	 	 	 	 	 	 	 	 ddUZddVZ G dW dX      Z G dY dZe0      Z/ G d[ d\      Z ej`                  d]ejb                  ^      Zdd_Z G d` dae>eeRe         Zej                   G db dc             Z edi dd ee$jn                  de dfg      dh ee$jn                  di dj dkl      dm ee$jn                  dn do dpl      dq ee$jn                  dr ds dtl      du ee$jn                  dv dw dxl      dy ee$jn                  dz d{ dy|      d} ee$jn                  d~ d dl      d ee$jn                  d d d d      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d d d d      d ee$jn                  d d d|      d ee$jn                  d d dl      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d d dl      d ee$jn                  d d dl      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  dÄ dĬg      d ee$jn                  dƄ dǬg      d ee$jn                  dɄ dʬg      d ee$jn                  d̄ dͬg      d ee$jn                  dτ dЬg      d ee$jn                  d҄ dӬg      d ee$jn                  dՄ d֬g      d ee$jn                  d؄ d٬g      d ee$jn                  dۄ dܬg      d ee$jn                  dބ d߬g      d ee$jn                  d dg      d ee$jn                  d dg      d ee$jn                  d dg      Zded<   ddZ G d deD      Z G d deG      Z G d de      Zej                   G d d             Z G d d      Z e       Z G d d      Z G d d      Z edeq      Z edSee      Zeree!j                  eTeeeedf   f   f   Z G d deeef         Z G d  d      Z G d deee         Zej                   G d d             Zej,                  d d       Z G d d      Z G d	 d
e?      Zyc c} w (!      )annotationsN)ABCabstractmethod)autoEnum)chain)AnycastClassVarGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)SelfTypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)ConfigModule)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)ShapePropagationOpsHandler)boolean_opsDeferredLineBasegenerate_assertget_current_backendIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)NullHandlerops
OpsHandlerOpsValueReductionType	StoreModeV)CallableIteratorMutableMappingSequence)GraphModule)CustomGraphModulePass)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode)BlockShapeType   PythonWrapperCodegen_Tschedulec                x    t         j                  t        j                        rt         j	                  d|        y y )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)msgs    d/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/common.pydata_type_loggerrU   a   s*      /6< 0    c                  Z    e Zd ZU dZded<   ded<   ddZedd       ZddZedd	       Z	y
)FileBackedGraphModulez
    Output of FX wrapper codegen. Exposes the same methods as ModuleType, but these
    map back to a GraphModule instead of Python source.
    r<   gmzCallable[..., Any]compiled_fnc                &   t        j                  ddd      | _         t        j                  t        j
                  | j                   j                         | j                   5 }|j                  | j                         d d d        y # 1 sw Y   y xY w)Nzw+z.pyF)modesuffixdelete)	tempfileNamedTemporaryFileatexitregisterosremovenamewritevalue)selffs     rT   __post_init__z#FileBackedGraphModule.__post_init__p   si     !33eE
 			4==#5#56]] 	 aGGDJJ	  	  	 s   "BBc                .    | j                   j                  S N)r_   re   rh   s    rT   __file__zFileBackedGraphModule.__file__z   s    }}!!!rV   c                      | j                   | S rl   )rZ   rh   argss     rT   callzFileBackedGraphModule.call~   s    t&&rV   c                .    | j                   j                  S rl   )rY   coderm   s    rT   rg   zFileBackedGraphModule.value   s    ww||rV   NreturnNonerv   str)rq   	list[Any]rv   r	   )
__name__
__module____qualname____doc____annotations__rj   propertyrn   rr   rg    rV   rT   rX   rX   f   sF    
 	O##  " "'  rV   rX   c                  <    e Zd ZdZdZdZedd       Zedd       Zy)	WorkspaceZeroModer   rH   r   c                    | |k(  s|t         j                  k(  r| S | t         j                  k(  r|S t        d| d|d      )NzWorkspaceZeroMode.combine(, ))r   UNINITIALIZEDNotImplementedErrorabs     rT   combinezWorkspaceZeroMode.combine   sK    6Q+999H!///H!$>qe2aU!"LMMrV   c                F    | rt         j                  S t         j                  S rl   )r   ZERO_ON_CALLr   )	zero_fills    rT   	from_boolzWorkspaceZeroMode.from_bool   s    $111 ...rV   N)r   r   r   r   rv   r   )r   boolrv   r   )	r{   r|   r}   r   r   ZERO_PER_GRAPHstaticmethodr   r   r   rV   rT   r   r      s9    MLNN N / /rV   r   c                  4    e Zd ZdZedd       Zedd       Zy)CodegenSymbolzP
    An IR object possibly corresponding to a variable in the wrapper code.
    c                     y rl   r   rm   s    rT   get_namezCodegenSymbol.get_name       rV   c                     y rl   r   rm   s    rT   get_examplezCodegenSymbol.get_example   r   rV   Nrx   rv   z!Union[torch.Tensor, sympy.Symbol])r{   r|   r}   r~   r   r   r   r   rV   rT   r   r      s/        rV   r   T)frozenc                  &   e Zd ZU dZded<   ded<   ded<   ded	<   d
Zded<   ej                  Zded<   e	ddd       Z
e	d d       Ze	d!d       Ze	d!d       Zd"dZeZd#dZd$dZd%dZed%d       ZeZeZeZd&dZd'dZd'dZd(dZd)dZd*dZy)+WorkspaceArga2  A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    
sympy.Exprcountr   	zero_modetorch.devicedevicery   
outer_namews_ptr
inner_nametorch.dtypedtypec                P    |  t        t        j                  j                         S rl   )nextr7   graphworkspace_id)prefixs    rT   unique_namezWorkspaceArg.unique_name   s!    $qww334566rV   c                    | j                   |j                   k(  xr4 | j                  |j                  k(  xr | j                  |j                  k(  S rl   )r   r   r   r   s     rT   can_joinzWorkspaceArg.can_join   s@     LLALL(XQWW-?XAHHPQPXPXDX	
rV   c                    t        | j                  |j                  z   t        j                  | j                  |j                        | j
                  | j                  | j                  | j                        S N)r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   s     rT   joinzWorkspaceArg.join   sS    ''AGG#'//Q[[I''88||||
 	
rV   c                   | j                   |j                   k(  r2| j                  |j                  k(  r| j                  |j                  k(  sJ t        t	        j
                  | j                  |j                        t        j                  | j                  |j                        | j                   | j                  | j                  | j                        S r   )r   r   r   r   sympyMaxr   r   r   r   r   r   s     rT   maximumzWorkspaceArg.maximum   s     GGqww188qxx#7ALLALL<X	
X))AGGQWW-'//Q[[I''88||||
 	
rV   c                    | j                   S rl   r   rm   s    rT   
get_devicezWorkspaceArg.get_device   s    {{rV   c                    | j                   S rl   r   rm   s    rT   	get_dtypezWorkspaceArg.get_dtype   s    zzrV   c                >    | j                         j                         S rl   )
get_layoutr   rm   s    rT   r   zWorkspaceArg.get_example   s     ,,..rV   c                f    ddl m}  || j                  | j                  | j                  gdg      S )Nr   )r@   rH   )r   r   sizestride)irr@   r   r   r   )rh   r@   s     rT   r   zWorkspaceArg.get_layout   s.    $;;****3	
 	
rV   c                "    | j                         S rl   )r   rm   s    rT   layoutzWorkspaceArg.layout   s      rV   c                6    t         j                  j                  S rl   )r   SZerorm   s    rT   
get_offsetzWorkspaceArg.get_offset   s    ww||rV   c                    | j                   gS rl   )r   rm   s    rT   get_sizezWorkspaceArg.get_size   s    

|rV   c                8    t         j                  j                  gS rl   )r   r   Onerm   s    rT   
get_stridezWorkspaceArg.get_stride  s    }rV   c                    | j                   S rl   )r   rm   s    rT   r   zWorkspaceArg.get_name  s    rV   c                     y)NFr   rm   s    rT   get_is_pinnedzWorkspaceArg.get_is_pinned	  s    rV   c                    g S rl   r   rm   s    rT   get_inputs_that_alias_outputz)WorkspaceArg.get_inputs_that_alias_output  s    	rV   N)
workspace_)r   ry   rv   ry   )r   r   r   r   rv   r   )r   r   r   r   rv   r   )rv   r   )rv   r   r   )rv   r@   )rv   r   )rv   list[sympy.Expr]rx   )rv   r   )rv   	list[str])r{   r|   r}   r~   r   r   torchuint8r   r   r   r   r   r   r   get_device_or_errorr   r   r   r   r   get_output_specmaybe_get_output_specmaybe_get_layoutr   r   r   r   r   r   r   rV   rT   r   r      s    	   OJE;$7 7 
 

 
 
 
 
 %/
 ! ! !O&!rV   r   c                      e Zd ZddZddZy)TritonScratchWorkspacec                     || _         || _        y rl   )r   _generate_dtype_str)rh   r   generate_dtype_strs      rT   __init__zTritonScratchWorkspace.__init__  s    	#5 rV   c                "    | j                         S rl   )r   rm   s    rT   r   z)TritonScratchWorkspace.generate_dtype_str  s    ''))rV   N)r   intr   Callable[..., str]rx   )r{   r|   r}   r   r   r   rV   rT   r   r     s    6*rV   r   c                  p    e Zd ZU ded<   ded<   ded<   ej
                  j                  Zded<   dZd	ed
<   y)	TensorArgry   re   bufferr   r   r   offsetNOptional[str]alias_of)	r{   r|   r}   r   r   r   r   r   r   r   rV   rT   r   r     s.    
IKFJ%"Hm"rV   r   c                  4    e Zd ZU ded<   ded<   edd       Zy)SizeArgry   re   r   exprc                     y rl   r   rm   s    rT   r   zSizeArg.alias_of'  s    rV   Nrv   r   )r{   r|   r}   r   r   r   r   rV   rT   r   r   "  s    
I
 rV   r   c                      e Zd ZU ded<   y)ConstexprArgry   re   Nr{   r|   r}   r   r   rV   rT   r   r   ,  s    
IrV   r   c                  6    e Zd ZU ded<   ded<   ded<   ded<   y)	TMADescriptorArgry   re   api_typezOptional[list[sympy.Expr]]block_shapeOptional[torch.dtype]r   Nr   r   rV   rT   r   r   1  s    
IM++  rV   r   c                  >    e Zd ZU ded<   ded<   dZded<   dZded<   y)	DeviceCodegenSchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNOptional[WrapperConstructor]cpp_wrapper_codegenfx_wrapper_codegen)r{   r|   r}   r   r   r  r   rV   rT   r   r   9  s&    %%''8<5<7;4;rV   r   zdict[str, DeviceCodegen]device_codegensc                      e Zd ZddZddZddZddZddZddZddZ	ddZ
dd	Zdd
ZddZddZddZddZddZddZ	 d	 	 	 	 	 	 	 ddZy)DeviceOpOverridesc                    t         rl   r   rh   re   s     rT   import_get_raw_stream_asz*DeviceOpOverrides.import_get_raw_stream_asG      !!rV   c                    t         rl   r  rh   
device_idxs     rT   
set_devicezDeviceOpOverrides.set_deviceJ  r	  rV   c                    t         rl   r  rm   s    rT   synchronizezDeviceOpOverrides.synchronizeM  r	  rV   c                    t         rl   r  r  s     rT   device_guardzDeviceOpOverrides.device_guardP  r	  rV   c                    t         rl   r  rm   s    rT   cpp_device_guardz"DeviceOpOverrides.cpp_device_guardS  r	  rV   c                    t         rl   r  rm   s    rT   cpp_aoti_device_guardz'DeviceOpOverrides.cpp_aoti_device_guardV  r	  rV   c                    t         rl   r  rm   s    rT   cpp_stream_guardz"DeviceOpOverrides.cpp_stream_guardY  r	  rV   c                    t         rl   r  rm   s    rT   cpp_aoti_stream_guardz'DeviceOpOverrides.cpp_aoti_stream_guard\  r	  rV   c                    t         rl   r  rm   s    rT   cpp_getStreamFromExternalz+DeviceOpOverrides.cpp_getStreamFromExternal_  r	  rV   c                    t         rl   r  rm   s    rT   kernel_headerzDeviceOpOverrides.kernel_headerb  r	  rV   c                    t         rl   r  rm   s    rT   kernel_driverzDeviceOpOverrides.kernel_drivere  r	  rV   c                    t         rl   r  rm   s    rT   cpp_stream_typez!DeviceOpOverrides.cpp_stream_typeh  r	  rV   c                    t         rl   r  rm   s    rT   aoti_get_streamz!DeviceOpOverrides.aoti_get_streamk  r	  rV   c                    t         rl   r  rm   s    rT   cpp_kernel_typez!DeviceOpOverrides.cpp_kernel_typen  r	  rV   c                    t         rl   r  rm   s    rT   cpp_device_ptrz DeviceOpOverrides.cpp_device_ptrq  r	  rV   c                    t         rl   r  rm   s    rT   tma_descriptor_helpersz(DeviceOpOverrides.tma_descriptor_helperst  r	  rV   Nc                    t         rl   r  )rh   idx	workspacer   s       rT   cpp_scratchzDeviceOpOverrides.cpp_scratchw  s
     "!rV   re   ry   rv   ry   )r  r   rv   ry   rx   rl   )r+  r   r,  r   r   r   rv   zOptional[tuple[list[str], str]])r{   r|   r}   r  r  r  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r-  r   rV   rT   r  r  F  s~    """""""""""""""" TX""#9"CP"	("rV   r  zdict[str, DeviceOpOverrides]device_op_overrides_dictz*dict[str, Optional[CustomGraphModulePass]]custom_backend_passesz!dict[str, Optional[ConfigModule]]custom_backend_codegen_configsc                    t        ||||      t        | <   |t        | <   |r)t        |t              r|t
        usJ d|dt
               |t        | <   y )Nzdevice_custom_config=z: cannot be the same as the default inductor config config=)r   r  r0  
isinstancer   r   r1  )r   device_schedulingdevice_wrapper_codegendevice_cpp_wrapper_codegendevice_fx_wrapper_codegendevice_custom_passdevice_custom_configs          rT   register_backend_for_devicer:    sv     ,"!	OF %7&!+\:$F2	
 %#%%`Y_Xab		
3
 .B"6*rV   c                      e Zd Z e       Z e       Z e       Z e       Z e       Z e       Z	 e       Z
 e       Z e       Z e       Zy)BackendFeatureN)r{   r|   r}   r   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTUPLE_REDUCTIONPREFER_STORE_LOOP_ORDERTRITON_TEMPLATESREDUCE_TO_SINGLE_ELEMENTr   rV   rT   r<  r<    sL    fGIfO $6D6DfO"fv#vrV   r<  c                :   | 
t               S t                t        | t        j                        r| j
                  }n7t        | t              sJ t        |              | }t        j                  |      } t        |      }|sJ  |d       }|j                  |       S rl   )	r   init_backend_registrationr3  r   r   typery   get_scheduling_for_deviceget_backend_features)r   device_typescheduling_ctorr   s       rT   rK  rK    s     ~|&%,,'kk&#&4V4&k*/<O? &J**622rV   c                @    t        |t              sJ |t        |       v S )zSee also V.graph.has_feature)r3  r<  rK  )r   features     rT   has_backend_featurerP    s%     g~...*6222rV   c                <    | t         v rt         |    j                  S d S rl   )r  r   r   s    rT   rJ  rJ    s     17?1J?6"--TPTTrV   c                v    | t         v r1t         |    }|r|j                  S |r|j                  S |j                  S y rl   )r  r  r   r   )r   cpp_wrapper
fx_wrapperwrapper_codegen_objs       rT   get_wrapper_codegen_for_devicerV    sD      -<V-D&999&:::&666rV   c                ,    t         j                  |       S rl   )r0  getr   s    rT   "get_custom_backend_pass_for_devicerY    s     $$V,,rV   c                ,    t         j                  |       S rl   )r1  rX  r   s    rT   $get_custom_backend_config_for_devicer[    s    )--f55rV   c                    ddl m}  ddlm} ddlm} ddlm} ddlm	} ddl
m} ddlm} dd	lm} dd
lm} ddlm}	 ddlm}
 ddlm} ddlm} t5        d      5| ||
|dt7        dfd|t8        j:                  j<                  r|n||       t5        d      |||dt7        dfd|||       t5        d      t7        d|
|||       t5        d      t7        d||||       t5        d      t7        d|
|	||       t>        j@                  jC                         }|dk7  rLt5        |      @ddl"m#} 	  |d      } |d      } |d      } |d      }|r|r|rt7        |||||       yyyyyy# tH        $ r Y yw xY w) z
    Register the backend for different devices, including the scheduling
    for kernel code generation and the host side wrapper code generation.
    rH   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CppWrapperMps)CUDACombinedScheduling)HalideScheduling)MetalScheduling)PallasScheduling)PythonWrapperMtia)TritonSchedulingrI   )WrapperFxCodegencpuN)cpphalidetritonpallasc                6     t         j                     |       S rl   )r   cpu_backend)r   cpu_backendss    rT   <lambda>z+init_backend_registration.<locals>.<lambda>  s    ?|F,>,>?
K rV   cuda)rl  rk  rm  c                6     t         j                     |       S rl   )r   cuda_backend)r   cuda_backendss    rT   rq  z+init_backend_registration.<locals>.<lambda>!  s    A}V-@-@A*M rV   xpumpsmtiaprivateuseoner   )_get_custom_mod_func
SchedulingrJ   CppWrapperCodegenrh  )%rj  r]  cpp_wrapper_cpur^  cpp_wrapper_cpu_array_refr_  cpp_wrapper_gpur`  cpp_wrapper_mpsra  cuda_combined_schedulingrb  rk  rc  rw  rd  rm  re  python_wrapper_mtiarf  rl  rg  wrapperrJ   wrapper_fxirrh  rJ  r:  r   aot_inductorallow_stack_allocationr   _C_get_privateuse1_backend_name torch.utils.backend_registrationrz  RuntimeError)r]  r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rJ   rh  private_backendrz  r4  r   r   r  rp  ru  s                      @@rT   rH  rH    s    #.@..@($(6(-. '/ &&&	
 	$K ""99 "	
 !(0 -&&

 	$M 	
 !'/# 	
 !'/# 	
 !(0#	
 hh<<>O?*%o6>I	 4\ B23IJO"67J"K!56H!I _9L+#%#'& :M_  ? 	+$  		s   75E2 2	E>=E>c                L    ddl m} g | t        ||j                  |            S )Nr   )FlexibleLayout)r   r  r,   contiguous_strides)index
index_varssizesr  s       rT   index_prevent_reorderingr  Z  s,    
 $ UUTIj.*K*KE*RSTTrV   c                    |t         | <   y rl   )r/  )r   device_op_overridess     rT   register_device_op_overridesr  e  s     (;V$rV   c                    t        | t              sJ t        |              t        sddlm}m} ddlm} ddl	m} ddl
m} t        |    S )NrH   )cpu_device_op_overridesmps_device_op_overrides)r  )r3  ry   rI  r/   r  r  rr  r  rx  rv  )r   r  r  r  mtia_op_overridesxpu_op_overridess         rT   get_device_op_overridesr  k  s5    fc"0DL0"#F-B@#F++rV   zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEc                r   | t               v rt        j                  S | dv rd|v r|d   S |d   S | dv rt        j                  S | dv rt        j                  S | dk(  rd|v r|d   S |d   S | dk(  rd|v r|d   S |d   S | d	v r$|d   }t
        j                  j                  |      S | d
k(  rd|v r|d   S |d   S y)zK
    Given op name and a list of input dtypes, deduce the output dtype
    )to_dtype
index_exprr   )randrandn)	get_index	randint64	load_seed	reductionrH   constant)loadstorestore_reductionto_dtype_bitcastN)r%   r   r   floatint64r7   r   r   )op_namerq   kwargsbuf_names       rT   deduce_output_dtype_by_namer    s
    +-zz	  
 #*V"3vgAbA	  
 {{	  

 {{	K	")V"3vg@a@	J	")V"3vgAbA	  

 7ww  **	&	&")V"3vgAbArV   CSEVariableTypec                   t               }t        j                  j                  r'|dk(  r"| j	                  d| dt        |       d       y t        j                  j                  r|dk(  rddlm}m	} t        ||      sJ t        |             |t        j                  k(  r|j                  rd| d	}n.d
| d| d}n$d| d}|j                  rd| d}d| d||    d}| j	                  d| d       y y y )Nrl  tl.static_assert(z
.dtype == r   rj  rH   )CppCSEVariableDTYPE_TO_CPPzIsVecMaskType<decltype(z	)>::valuezstd::is_same_v<decltype(z$), bool> || std::is_same_v<decltype(z), int>z	decltype(z	typename z::value_typezstd::is_same_v<r   >zstatic_assert(z);)r(   r   test_configsruntime_triton_dtype_assert	writeliner/   static_cpp_dtype_assert	cpp_utilsr  r  r3  rI  r   r   is_vec)r   varr   backendr  r  
is_same_dt
c_var_types           rT   check_dtyper    s    "#G667h;N,SEK<N;OqQR				4	4E9I;#~.9S	9.EJJzz6se9E
  8u<`ad`eelm
$SE+Jzz(LA
*:,be9L8MQOJ>*R89! :J	4rV   c                    t               }|J t        j                  j                  rM|dk(  rGt	        |      dk7  rdj                  d |D              n|d    d}| j                  d| d| d	       y y y )
Nrl  rH   r   c              3  2   K   | ]  }t        |        y wrl   ry   ).0ds     rT   	<genexpr>zcheck_shape.<locals>.<genexpr>  s     ,c!f,s   r   ,r  z.shape == ()))r(   r   r  runtime_triton_shape_assertlenr   r  )r   r  shaper  	shape_strs        rT   check_shaper    s     "#G667h;N03E
aDII,e,,azQR^ 	 	,SEYKrJK	 <O6rV   c                j    t               }|dk(  r$d}| j                  d| d| d| d| d| d       y y )	Nrl  zNaN or Inf foundztl.device_assert((z == ) & (z != float('inf')) & (z != float('-inf')), 'z'))r(   r  )r   r  r  rS   s       rT   	check_nanr    sS    !#G(  T#eC58McURghkgllno	
 rV   c                  `    e Zd Zd
dZddZddZddZddZddZe	dd       Z
e	dd       Zy	)DataTypePropagationc                    || _         d|j                  j                  i| _        |j                  j                         D ]  \  }}|j                  | j                  |<     y Nroot)body
root_blockr   graphs	subblocksitems)rh   r  kvs       rT   r   zDataTypePropagation.__init__  sU    	DOO))B
 NN((* 	%DAqWWDKKN	%rV   c                   |j                   }|D cg c]9  }t        |t        j                  j                        s(|j
                  dk7  s8|; }}t        |      dk(  ry t        d |D              }|sy t        j                  t        j                  |D cg c])  }|j                  t        j                     j                  + c}      S c c}w c c}w )Nplaceholderr   c              3     K   | ]K  }t         j                  |j                  v xr) |j                  t         j                     j                  d u M y wrl   )OptimizationContextkeymetar   )r  ns     rT   r  zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>  sS      )
   ##qvv- B*../55TAB)
s   AA)all_input_nodesr3  r   fxNodeopr  all	functoolsreducepromote_typesr  r  r  r   )rh   nodeinputsr  input_nodesall_input_nodes_propagateds         rT   deduce_node_dtype_by_inputsz/DataTypePropagation.deduce_node_dtype_by_inputs  s    %%
Auxx}}!=!$$-BWA
 
 {q %( )
 !)
 &
"
 *<GHqQVV'++,22H
 	

  Is   )CCC.C
c                b    | j                   |j                     }| j                  |      }|sJ |S rl   )r  targetpropagate_graph)rh   r  	sub_graphr   s       rT   deduce_node_dtype_by_subgraphz1DataTypePropagation.deduce_node_dtype_by_subgraph  s0    KK,	$$Y/urV   c                   |j                   dk(  ry |j                  dk(  rt        |j                        dk7  ry |j                  t        j
                  u rT|j                  d   }t        |t        j                  j                        sJ t        |             | j                  |      S t        |j                  t              sJ t        |j                               |j                  j                  d      r| j                  |      S t        |j                  g|j                  i |j                   x}	 |S | j#                  |      S )Nr  outputrH   r   masked_subblock)r  r  r  rq   operatorgetitemr3  r   r  r  rI  deduce_node_dtypery   
startswithr  r  r  r  )rh   r  node_argoutput_dtypes       rT   r  z%DataTypePropagation.deduce_node_dtype	  s   77m#;;("s499~':;;(***yy|Hh6FXF6))(33$++s+>T$++->>+;;!!"3455d;; 8 ++ L
   //55rV   c                n   |j                   sJ d }|j                   D ]  }t        j                  |j                  v r|j                  t        j                     }n
t               }| j	                  |      |_        ||j                  t        j                  <   |j                  dk(  s|j
                  } |S )Nr  )nodesr  r  r  r  r   r  )rh   r   graph_dtyper  opt_ctxs        rT   r  z#DataTypePropagation.propagate_graph&  s    {{{-1 KK 		,D"&&$))3))$7$;$;<-/ 2248GM18DII)--.{{h&%mm		, rV   c                >    | j                  | j                  d         S r  )r  r  rm   s    rT   	propagatezDataTypePropagation.propagate8  s    ##DKK$788rV   c                .     | |      j                         S rl   )r  )clsr  s     rT   propagate_loopbodyz&DataTypePropagation.propagate_loopbody;  s    4y""$$rV   c                    ddl m} ddlm} t	        ||      sJ t        |             t	        |j                  |      sJ t        |j                               t        j                  |j                        S )Nr   rB   )rF   )		loop_bodyrC   	schedulerrF   r3  rI  _bodyr  r  )r
  r  rC   rF   s       rT   propagate_scheduler_nodez,DataTypePropagation.propagate_scheduler_node?  sX    (-$.:T
:.$**h/Adjj1AA/"55djjAArV   N)r  rC   rv   rw   )r  torch.fx.Noderv   r   )r  r  rv   r   )r   ztorch.fx.Graphrv   r   )rv   r   )r  rC   rv   r   )r  rF   rv   r   )r{   r|   r}   r   r  r  r  r  r  classmethodr  r  r   rV   rT   r  r    sJ    %
*6:$9 % % B BrV   r  c                  D     e Zd Zddd	 	 	 	 	 	 	 d fdZdd fdZ xZS )r   T)simplifypc                   |r]t        |t        j                        rCt        t        j
                  d      r)t        j
                  j                  j                  |      }t        | %  |      S )Nsizevars)
r3  r   Exprhasattrr7   r   r  r  superdoprint)rh   r   r  r  	__class__s       rT   r  zPythonPrinter.doprintJ  sK     
44*9U77##,,T2Dwt$$rV   c                    t        |t        j                        rd| j                  |       dS t        |   |||      S N(r   )r3  r   Mod_printr  parenthesize)rh   itemlevelstrictr  s       rT   r"  zPythonPrinter.parenthesizeR  s@    dEII& t{{4()++7'eV<<rV   )r   r   r  r   r  r   rv   ry   )F)r#  r   r$  r   r%  r   rv   ry   )r{   r|   r}   r  r"  __classcell__r  s   @rT   r   r   I  s7    48D%%-1%=A%	%= =rV   r   c                  T   e Zd ZdZedd       Zedd       Zedd       Zedd       Zedd       Z	edd       Z
edd       Zedd	       Zedd
       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zedd       Zy)OpDecompositionsz!
    Decomposes inductor ops
    c                    | S rl   r   )rg   s    rT   identityzOpDecompositions.identity`  s	     rV   c                r    t        j                  t        j                  dt        j                        |       S NrH   )r2   truedivr  r   int32xs    rT   
reciprocalzOpDecompositions.reciprocale  s"    {{3<<5;;7;;rV   c                .    t        j                  | |       S rl   )r2   mulr0  s    rT   squarezOpDecompositions.squarei  s    wwq!}rV   c                    t        j                  t        j                  dt        j                        t        j
                  |             S r-  )r2   subr  r   float32erfr0  s    rT   erfczOpDecompositions.erfcm  s*    wws||Au}}5swwqzBBrV   c                    t        j                  t        j                  t        j                  |             t        j                  |             S rl   )r2   r4  expr5  r:  r0  s    rT   erfcxzOpDecompositions.erfcxq  s,    wwswwszz!}-sxx{;;rV   c                    t        j                  t        j                  |       t        j                  dt        j
                              S r-  )r2   r7  r<  r  r   r8  r0  s    rT   expm1zOpDecompositions.expm1u  s*    wwswwqz3<<5==#ABBrV   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrH   
   r2   r4  logr  mathr   r8  r0  s    rT   log10zOpDecompositions.log10y  s7    wwswwqz3<<DHHRL0@%--#PQQrV   c           	         t        j                  t        j                  |       t        j                  dt	        j                  d      z  t
        j                              S )NrH   r   rB  r0  s    rT   log2zOpDecompositions.log2}  s6    wwswwqz3<<DHHQK#OPPrV   c           
         t        j                  t        j                  | t        j                  t	        j
                  d      t        j                                    S )Nr   )r2   r<  r4  r  rD  rC  r   r8  r0  s    rT   exp2zOpDecompositions.exp2  s3    wwswwq#,,txx{EMM"JKLLrV   c           	         t        j                  t        j                  | t        j                  dt        j
                                    S r-  )r2   rC  addr  r   r/  r0  s    rT   log1pzOpDecompositions.log1p  s+    wwswwq#,,q%++">?@@rV   c                    t        j                  dt        j                        }t        j                  |t        j
                  |t        j                  t        j                  |                         S r-  )r2   r  r   r/  r.  rK  r<  neg)r1  ones     rT   sigmoidzOpDecompositions.sigmoid  sC    ll1ekk*{{3SWWSWWQZ-@ ABBrV   c                r    t        j                  | t        j                  dt        j                              S Nr   )r2   r   r  r   r/  r0  s    rT   reluzOpDecompositions.relu  s"    {{1cll1ekk:;;rV   c                V    t        j                  t        j                  | |      |      S rl   )r2   rK  r4  r1  yzs      rT   fmazOpDecompositions.fma  s     wwswwq!}a((rV   c                T    t        j                  t        j                  |       |      S rl   )r2   r  floorr   r   s     rT   floor_to_intzOpDecompositions.floor_to_int      ||CIIaL%00rV   c                T    t        j                  t        j                  |       |      S rl   )r2   r  ceilr[  s     rT   ceil_to_intzOpDecompositions.ceil_to_int  s    ||CHHQK//rV   c                T    t        j                  t        j                  |       |      S rl   )r2   r  truncr[  s     rT   trunc_to_intzOpDecompositions.trunc_to_int  r]  rV   c           	        t        j                  | |      }t        j                  t        j                  |t        j                  dt
        j                              t        j                  t        j                  |      t        j                  |                  }t        j                  |t        j                  ||      |      S rR  )
r2   modand_ner  r   r/  signbitwhererK  )r   r   rconds       rT   	remainderzOpDecompositions.remainder  sy    GGAqMxxFF1cll1ekk23FF3;;q>3;;q>2
 yyswwq!}a00rV   c                T    t        j                  t        j                  |       |      S rl   )r2   r  roundr[  s     rT   round_to_intzOpDecompositions.round_to_int  r]  rV   N)rg   OpVarTrv   rp  r1  rp  rv   rp  )r1  rp  rV  rp  rW  rp  rv   rp  )r   rp  r   r   rv   rp  r   rp  r   rp  rv   rp  )r{   r|   r}   r~   r   r+  r2  r5  r:  r=  r?  rE  rG  rI  rL  rP  rS  rX  r\  r`  rc  rl  ro  r   rV   rT   r)  r)  [  s}      < <   C C < < C C R R Q Q M M A A C C < < ) ) 1 1 0 0 1 1 1 1 1 1rV   r)  z[a-z0-9_.]+|\([^)]*\)|)flagsc                    | d   dk7  st        |       dk  ryd}t        | dd        D ]3  \  }}|dk(  r|dz  }n
|dk(  r|dz  }|dk(  s!|t        |       dz
  k7  s3 y |dk(  sJ y)Nr   r  r   FrH   r   T)r  	enumerate)stringr   ichars       rT   _all_in_parensry    s    ayC3v;?EVABZ( 43;QJES[QJEA:!s6{Q. A::rV   c                  X   e Zd Zed"d       Zed#d       Zed$d       Zed%d       Zed&d       Zed&d       Z	ed&d       Z
ed&d       Zed&d	       Zed'd
       Zed(d       Z	 	 d)	 	 	 	 	 	 	 	 	 d*dZ	 	 	 	 	 	 	 	 	 	 d+dZd,dZ	 d-	 	 	 	 	 	 	 	 	 d.dZd/dZd0dZ	 	 	 	 	 	 	 	 	 	 d1dZ	 	 	 	 	 	 	 	 d2dZ	 	 	 	 	 	 	 	 	 	 d3dZ	 	 d4	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d5dZd6dZd&dZdej8                  ddd	 	 	 	 	 	 	 	 	 	 	 	 	 d7dZd8dZd9dZed:d       Z e!d;d        Z"e!d<d!       Z#y)=OpOverridesc                r    t        | t              s t        j                  |       st	        |       r| S d|  dS r  )r3  CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchry  )rv  s    rT   parenzOpOverrides.paren  s9     v{+#--f5f% M6(!}rV   c                    t        |       S rl   )repr)rg   r   s     rT   r  zOpOverrides.constant  s    E{rV   c                2    dt         j                  |        S )N~r{  r  r0  s    rT   bitwise_notzOpOverrides.bitwise_not  s    ;$$Q'())rV   c                2    t         j                  |        dS )Nz == 0r  )r   s    rT   logical_notzOpOverrides.logical_not  s    ##A&'u--rV   c                \    t         j                  |        dt         j                  |       S )Nz & r  r1  rV  s     rT   bitwise_andzOpOverrides.bitwise_and  +    ##A&'s;+<+<Q+?*@AArV   c                \    t         j                  |        dt         j                  |       S )Nz | r  r  s     rT   
bitwise_orzOpOverrides.bitwise_or  r  rV   c                \    t         j                  |        dt         j                  |       S )Nz ^ r  r  s     rT   bitwise_xorzOpOverrides.bitwise_xor  r  rV   c                \    t         j                  |        dt         j                  |       S )Nz << r  r  s     rT   bitwise_left_shiftzOpOverrides.bitwise_left_shift  +    ##A&'tK,=,=a,@+ABBrV   c                \    t         j                  |        dt         j                  |       S )Nz >> r  r  s     rT   bitwise_right_shiftzOpOverrides.bitwise_right_shift  r  rV   c                .    t        j                  | |      S rl   )r2   r.  r   s     rT   int_truedivzOpOverrides.int_truediv  s    
 {{1a  rV   c                T    t        j                  | t        j                  |            S rl   )r2   r  r   Integer)re   r   s     rT   r  zOpOverrides.load_seed  s    xxemmF344rV   Tc                *    t        t        |            S rl   )r-   ry   )rh   r  r   checkwrap_negs        rT   indirect_indexingzOpOverrides.indirect_indexing  s     "#c(++rV   c                D    t        t        |       j                   d      )Nz,: check_bounds should be handled by CSEProxyr   rI  r{   rh   r   r   loweruppers        rT   check_boundszOpOverrides.check_bounds  s'     "Dz""##OP
 	
rV   c                D    t        t        |       j                   d      )Nz$: load should be handled by CSEProxyr  rh   re   r  s      rT   r  zOpOverrides.load  s%    !Dz""##GH
 	
rV   Nc                D    t        t        |       j                   d      )Nz%: store should be handled by CSEProxyr  rh   re   r  rg   r\   s        rT   r  zOpOverrides.store  s'     "Dz""##HI
 	
rV   c                D    t        t        |       j                   d      Nz3: device_assert_async should be handled by CSEProxyr  rh   rk  rS   s      rT   device_assert_asynczOpOverrides.device_assert_async  %    !Dz""##VW
 	
rV   c                D    t        t        |       j                   d      )Nz/: store_reduction should be handled by CSEProxyr  rh   re   r  rg   s       rT   r  zOpOverrides.store_reduction  s%    !Dz""##RS
 	
rV   c                D    t        t        |       j                   d      )Nz): reduction should be handled by CSEProxyr  rh   r   	src_dtypereduction_typerg   s        rT   r  zOpOverrides.reduction!  s'     "Dz""##LM
 	
rV   c                D    t        t        |       j                   d      )Nz$: scan should be handled by CSEProxyr  rh   dtypes
combine_fnvaluess       rT   scanzOpOverrides.scan,  s'     "Dz""##GH
 	
rV   c                D    t        t        |       j                   d      )Nz$: sort should be handled by CSEProxyr  rh   r  r  stable
descendings        rT   sortzOpOverrides.sort9  s'     "Dz""##GH
 	
rV   c                D    t        t        |       j                   d      )Nz): bucketize should be handled by CSEProxyr  rh   r  
boundariesboundary_indicesindexing_dtyperightsortersorter_indicess           rT   	bucketizezOpOverrides.bucketizeD  s'     "Dz""##LM
 	
rV   c                D    t        t        |       j                   d      )Nz2: halide_clamp only implemented for Halide backendr  )rh   rg   r   r  s       rT   halide_clampzOpOverrides.halide_clampR  s%    !Dz""##UV
 	
rV   c                D    t        t        |       j                   d      )Nz): dot only implemented for Triton backendr  )rh   r1  rV  s      rT   dotzOpOverrides.dotW  s%    !Dz""##LM
 	
rV   rH   )constraintsr   is_purepackc               D    t        t        |       j                   d      )Nz<: inline_asm_elementwise only implemented for Triton backendr  )rh   asmr  r   r  r  r  s          rT   inline_asm_elementwisez"OpOverrides.inline_asm_elementwise\  s'     "Dz""##_`
 	
rV   c                D    t        t        |       j                   d      )Nz.: ops.output should not appear at codegen timeAssertionErrorrI  r{   rp   s     rT   r  zOpOverrides.outputi  s%    Dz""##QR
 	
rV   c                D    t        t        |       j                   d      )Nz3: ops.placeholder should not appear at codegen timer  rh   r  s     rT   r  zOpOverrides.placeholdern  s%    Dz""##VW
 	
rV   c                0     d fd} |_         d|_        |S )Nc                J    t        t        |       j                   d       )Nz does not implement ops.r  )rh   rq   r  re   s      rT   unimplementedz1OpOverrides._unimplemented.<locals>.unimplementedu  s*    %:&&''?vF rV   T)rh   r{  rq   r	   r  r	   rv   rp  )r{   is_unimplemented)re   r  s   ` rT   _unimplementedzOpOverrides._unimplementeds  s     	
 "&)-&rV   c                p    t        | |d       }t        t        |d       }| xs ||k(  xs t        |dd      S )Nr  F)getattrr3   )r
  re   fn
default_fns       rT   _is_unimplementedzOpOverrides._is_unimplemented~  s?    S$%Zt4
vSz)SWR9KU-SSrV   c                P   |dv sJ |       t         j                         D ]  \  }}t        ||      }|/| j                  |      s&t	        | || j                  |             C|| j                  vsJ d| d| j                          ||_        t	        | |t        |              y )N)rl  rj  cppvecrk  rw  zmultiple definitions of z on )	pointwise_overrides_datar  r  r  setattrr  __dict__r{   r   )r
  r  funcnamedataimpls        rT   _initialize_pointwise_overridesz+OpOverrides._initialize_pointwise_overrides  s    EEMvME6<<> 
	;NHd4(D|((2C3+=+=h+GHs||3 .xjS\\NK3 !)X|D'9:
	;rV   )rv  rp  rv   rp  )rg   zUnion[bool, float, int]r   r   rv   rp  rq  )r   rp  rv   rp  )r1  rp  rV  rp  rv   rp  rr  )re   ry   r   rp  rv   rp  TT)
r  rp  r   Union[sympy.Expr, int]r  r   r  r   rv   sympy.Symbol
r   r   r   r   r  r   r  r   rv   rw   )re   ry   r  r   rv   rp  rl   )
re   ry   r  r   rg   rp  r\   r6   rv   rw   rk  r}  rS   ry   rv   rw   )re   ry   r  r   rg   rp  rv   rw   )
r   r   r  r   r  r5   rg   !Union[OpVarT, tuple[OpVarT, ...]]rv   r  )r  tuple[torch.dtype, ...]r  zFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]r  tuple[OpVarT, ...]rv   r  )
r  r  r  r  r  r   r  r   rv   r  NN)r  rp  r  .tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  rp  r  r   r  r   r   Optional[tuple[str, sympy.Expr]]r  zOptional[OpVarT]rv   rp  )rg   rp  r   r   r  r   rv   rp  )r  rp  r  ry   r  r   r   r   r  r   r  r   rv   rp  )rq   rp  rv   rw   )r  r   rv   rp  )re   ry   rv   zCallable[..., OpVarT]re   ry   rv   r   )r  ry   rv   rw   )$r{   r|   r}   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r8  r  r  r  r  r  r  r  r   rV   rT   r{  r{    s1   	 	   * * . . B B B B B B C C C C ! ! 5 5 ,, %, 	,
 , 
,

&0
9=
FJ
	

 NR

 *
39
AJ
	




	
	
 	
 &		

 1	
 
+	

'


 #
 

	
'	
 #	
 		

 	
 
	
$ 48+/

 C
 !	

 $
 
 1
 )
 




 &*"]]

 
 #	

 
 
 
 





   T T
 ; ;rV   r{  c                  |    e Zd ZU ded<   ded<   dZded<   dZded<   ej                  Zd	ed
<   dZ	ded<   dZ
ded<   y)OverridesDatary   re   r   rj  NzOptional[Callable[..., str]]rl  r  r   type_promotion_kindrk  rw  )r{   r|   r}   r   rl  r  r   DEFAULTr  rk  rw  r   rV   rT   r  r    sQ    
I	+/F(/+/F(/'// 8  ,0F(/(,C	%,rV   r  airy_aic                    d|  dS )Nzairy_ai_forward(r   r   r0  s    rT   rq  rq    s    (1- rV   special_airy_ai)r  rj  re   	bessel_j0c                    d|  dS )Nzbessel_j0_forward(r   r   r0  s    rT   rq  rq        *1#Q/ rV   c                    d|  dS )Nzlibdevice.j0(r   r   r0  s    rT   rq  rq        =1- rV   special_bessel_j0)r  rj  rl  re   	bessel_j1c                    d|  dS )Nzbessel_j1_forward(r   r   r0  s    rT   rq  rq    r  rV   c                    d|  dS )Nzlibdevice.j1(r   r   r0  s    rT   rq  rq    r   rV   special_bessel_j1	bessel_y0c                    d|  dS )Nzbessel_y0_forward(r   r   r0  s    rT   rq  rq    r  rV   c                    d|  dS )Nzlibdevice.y0(r   r   r0  s    rT   rq  rq    r   rV   special_bessel_y0	bessel_y1c                    d|  dS )Nzbessel_y1_forward(r   r   r0  s    rT   rq  rq    r  rV   c                    d|  dS )Nzlibdevice.y1(r   r   r0  s    rT   rq  rq    r   rV   special_bessel_y1digammac                    d|  dS )Nzcalc_digamma(r   r   r0  s    rT   rq  rq    s    aS* rV   c                    |  dS )Nz
.digamma()r   r0  s    rT   rq  rq    s    A3j) rV   )r  rj  r  re   r=  c                    d|  dS )Nzcalc_erfcx(r   r   r0  s    rT   rq  rq        A3a( rV   c                    d|  dS )Nzlibdevice.erfcx(r   r   r0  s    rT   rq  rq    s    +A3a0 rV   special_erfcxrX  c                    d|  d| d| dS )Nz	std::fma(r   r   r   rU  s      rT   rq  rq    s    is"QCr!A6 rV   c                    d|  d| d| dS )Nzfmadd(r   r   r   rU  s      rT   rq  rq    s    s"QCr!A6 rV   c                    d|  d| d| dS )Nzlibdevice.fma(r   r   r   rU  s      rT   rq  rq    s    s"QCr!A> rV   )r  rj  r  rl  re   igammac                    d|  d| dS Nzcalc_igamma(r   r   r   r  s     rT   rq  rq        <s"QCq1 rV   igammacc                    d|  d| dS Nzcalc_igammac(r   r   r   r  s     rT   rq  rq        =2aS2 rV   gammaincc                    d|  d| dS r  r   r  s     rT   rq  rq    r  rV   special_gammainc	gammainccc                    d|  d| dS r  r   r  s     rT   rq  rq    r  rV   special_gammaincci0c                    d|  dS )Nzcalc_i0(r   r   r0  s    rT   rq  rq        1o rV   c                    d|  dS Nzlibdevice.cyl_bessel_i0(r   r   r0  s    rT   rq  rq        3A3a8 rV   c                    |  dS )Nz.i0()r   r0  s    rT   rq  rq    s    A3e rV   )r  rj  rl  r  re   i0ec                    d|  dS )Nz	calc_i0e(r   r   r0  s    rT   rq  rq        	!A& rV   c                    |  dS )Nz.i0e()r   r0  s    rT   rq  rq    s    A3f rV   special_i0ei1c                    d|  dS )Nzcalc_i1(r   r   r0  s    rT   rq  rq    r(  rV   c                    d|  dS Nzlibdevice.cyl_bessel_i1(r   r   r0  s    rT   rq  rq    r+  rV   
special_i1i1ec                    d|  dS )Nz	calc_i1e(r   r   r0  s    rT   rq  rq    r/  rV   special_i1elog_ndtrc                    d|  dS )Nzcalc_log_ndtr(r   r   r0  s    rT   rq  rq    s    qc+ rV   special_log_ndtrmodified_bessel_i0c                    d|  dS )Nzmodified_bessel_i0_forward(r   r   r0  s    rT   rq  rq        3A3a8 rV   c                    d|  dS r*  r   r0  s    rT   rq  rq    r+  rV   special_modified_bessel_i0modified_bessel_i1c                    d|  dS )Nzmodified_bessel_i1_forward(r   r   r0  s    rT   rq  rq    r?  rV   c                    d|  dS r5  r   r0  s    rT   rq  rq    r+  rV   special_modified_bessel_i1modified_bessel_k0c                    d|  dS )Nzmodified_bessel_k0_forward(r   r   r0  s    rT   rq  rq    r?  rV   special_modified_bessel_k0modified_bessel_k1c                    d|  dS )Nzmodified_bessel_k1_forward(r   r   r0  s    rT   rq  rq    r?  rV   special_modified_bessel_k1ndtrc                    d|  dS )Nz
calc_ndtr(r   r   r0  s    rT   rq  rq  %  s    
1#Q' rV   special_ndtrndtric                    d|  dS )Nzcalc_ndtri(r   r   r0  s    rT   rq  rq  *  r  rV   special_ndtri	polygammac                *    |  d| d|  d| d| d|  dS )Nz == 0 ? calc_digamma(z) : (z == 1 ? trigamma(z) : calc_polygamma(r   r  r   r  s     rT   rq  rq  /  s8    S%aSaS0A!DWXYWZZ\]^\__ab rV   scaled_modified_bessel_k0c                    d|  dS )Nz"scaled_modified_bessel_k0_forward(r   r   r0  s    rT   rq  rq  7      :1#Q? rV   !special_scaled_modified_bessel_k0scaled_modified_bessel_k1c                    d|  dS )Nz"scaled_modified_bessel_k1_forward(r   r   r0  s    rT   rq  rq  <  rV  rV   !special_scaled_modified_bessel_k1spherical_bessel_j0c                    d|  dS )Nzspherical_bessel_j0_forward(r   r   r0  s    rT   rq  rq  B  s    4QCq9 rV   special_spherical_bessel_j0zetac                    d|  d| dS )Nzzeta(r   r   r   r  s     rT   rq  rq  G  s    52aS* rV   special_zetachebyshev_polynomial_tc                    d|  d| dS )Nzchebyshev_polynomial_t_forward(r   r   r   r  s     rT   rq  rq  L      :1#Rs!D rV   special_chebyshev_polynomial_tchebyshev_polynomial_uc                    d|  d| dS )Nzchebyshev_polynomial_u_forward(r   r   r   r  s     rT   rq  rq  Q  rc  rV   special_chebyshev_polynomial_uchebyshev_polynomial_vc                    d|  d| dS )Nzchebyshev_polynomial_v_forward(r   r   r   r  s     rT   rq  rq  V  rc  rV   special_chebyshev_polynomial_vchebyshev_polynomial_wc                    d|  d| dS )Nzchebyshev_polynomial_w_forward(r   r   r   r  s     rT   rq  rq  [  rc  rV   special_chebyshev_polynomial_wlegendre_polynomial_pc                    d|  d| dS )Nzlegendre_polynomial_p_forward(r   r   r   r  s     rT   rq  rq  `      9!BqcC rV   special_legendre_polynomial_pshifted_chebyshev_polynomial_tc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_t_forward(r   r   r   r  s     rT   rq  rq  e      B1#Rs!L rV   &special_shifted_chebyshev_polynomial_tshifted_chebyshev_polynomial_uc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_u_forward(r   r   r   r  s     rT   rq  rq  j  rt  rV   &special_shifted_chebyshev_polynomial_ushifted_chebyshev_polynomial_vc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_v_forward(r   r   r   r  s     rT   rq  rq  o  rt  rV   &special_shifted_chebyshev_polynomial_vshifted_chebyshev_polynomial_wc                    d|  d| dS )Nz'shifted_chebyshev_polynomial_w_forward(r   r   r   r  s     rT   rq  rq  t  rt  rV   &special_shifted_chebyshev_polynomial_whermite_polynomial_hc                    d|  d| dS )Nzhermite_polynomial_h_forward(r   r   r   r  s     rT   rq  rq  y  s    82aSB rV   special_hermite_polynomial_hhermite_polynomial_hec                    d|  d| dS )Nzhermite_polynomial_he_forward(r   r   r   r  s     rT   rq  rq  ~  rp  rV   special_hermite_polynomial_helaguerre_polynomial_lc                    d|  d| dS )Nzlaguerre_polynomial_l_forward(r   r   r   r  s     rT   rq  rq    rp  rV   special_laguerre_polynomial_lzdict[str, OverridesData]r  c                     t         fdt        j                  j                  t        j                  j                  t        j                  j
                  t        j                  j
                  fD              S )Nc              3  &   K   | ]  }|v  
 y wrl   r   )r  r1  re   s     rT   r  z$is_buffer_removed.<locals>.<genexpr>  s       		   )anyr7   r   removed_bufferskernelinplaced_to_removere   s   `rT   is_buffer_removedr    sU      GG##HH$$GG&&HH''	
  rV   c                  4     e Zd ZdZd fdZddZddZ xZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersc                V    t         |   |       || _        t        |t              rJ y rl   )r  r   re   r3  r&   )rh   re   liner  s      rT   r   zDeferredLine.__init__  s+    	d$45555rV   c                F    t        | j                        s| j                  S y rl   )r  re   r  rm   s    rT   __call__zDeferredLine.__call__  s     +99rV   c                .    t        | j                  |      S rl   )r  re   )rh   r  s     rT   	_new_linezDeferredLine._new_line  s    DIIt,,rV   )re   ry   r  ry   r   )r  ry   rv   r  )r{   r|   r}   r~   r   r  r  r&  r'  s   @rT   r  r    s    R6

-rV   r  c                      e Zd ZdddZy)BracesBufferc                H     t         j                  d fd       } |       S )Nc               3    K   t              D ](  } j                  d       xj                  dz  c_        * t               D ](  } xj                  dz  c_        j                  d       * d  t               D ](  } j                  d       xj                  dz  c_        * t              D ](  } xj                  dz  c_        j                  d       * y w)N{rH   })ranger  _indent)_r   rh   s    rT   ctxz BracesBuffer.indent.<locals>.ctx  s     6] "s#!" F7^ $!s#$ F7^ "s#!" 6] $!s#$s   C C#)rv   Iterator[None])
contextlibcontextmanager)rh   r   r  s   `` rT   indentzBracesBuffer.indent  s$    		"	"	$ 
#	$ urV   N)rH   )r   r   rv   z'contextlib.AbstractContextManager[None])r{   r|   r}   r  r   rV   rT   r  r    s    rV   r  c                  "    e Zd ZU ded<   ded<   y)InplacedBufferry   r   r   other_namesNr   r   rV   rT   r  r    s    OrV   r  c                  .    e Zd ZU ded<   dZded<   ddZy)	ArgNamery   re   Fr   is_constexprc                B    | j                    | j                  rd S d S )Nz : tl.constexprr  )re   r  rm   s    rT   	full_namezArgName.full_name  s*    ))$2C2C.LMMLMMrV   Nrx   )r{   r|   r}   r   r  r  r   rV   rT   r  r    s    
IL$NrV   r  c                      e Zd ZddZy)
RemovedArgc                     y)NREMOVEDr   rm   s    rT   __str__zRemovedArg.__str__  s    rV   Nrx   )r{   r|   r}   r  r   rV   rT   r  r    s    rV   r  c                     e Zd Ze	 	 	 	 	 	 	 	 dd       ZddZddZedd       ZddZddZ	ddZ
ej                  f	 	 	 	 	 	 	 ddZdd	Zdd
ZddZd dZd!dZd"dZd#dZ	 d$	 	 	 d%dZ	 	 d&dZd'dZd(dZd)dZy)*
KernelArgsc                ~    |j                  |t              }t        |t              r|  t	        |       x||<   }|S |S rl   )rX  r  r3  r  r  )r   odictre   result
new_results        rT   _lookupzKernelArgs._lookup  sD     */4)Afj)*0#e*'>>E$K*rV   c                J    i | _         i | _        i | _        i | _        g | _        y rl   )input_buffersoutput_buffersinplace_buffersr  workspace_argsrm   s    rT   r   zKernelArgs.__init__  s)    -/ACMO/124rV   c                    dj                  dj                  t        t        | j                  | j
                  | j                  | j                  g                  S )NzKernelArgs({})r   )formatr   mapr  r  r  r  r  rm   s    rT   __repr__zKernelArgs.__repr__  sS    &&II**++,,	

 	
rV   c                "    t        | t              S rl   )r3  r  r  s    rT   _buffer_is_marked_removedz$KernelArgs._buffer_is_marked_removed  s     $
++rV   c                :   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v rt        t        | j                  |         S || j                  v r't        t        | j                  |         j                  S |j                  d      r| j                  d| j                  |      S | j                  d| j                  |      S )Nseedin_ptr)r7   r   r  mutation_real_namerX  r  r  r
   ry   r  r  r   r   r  r  r  s     rT   inputzKernelArgs.input  s    7777$$77;;D$GD1772228D824&&&T006774'''(<(<T(BCNNN??6"<<(:(:DAA||Hd&8&8$??rV   c                   t         j                  j                  r4t         j                  j                  j                  j	                  ||      }|t         j                  j
                  vsJ |       || j                  v r't        t        | j                  |         j                  S | j                  d| j                  |      S )Nout_ptr)r7   r   r  r  rX  r  r  r
   r  r   r  r  r  s     rT   r  zKernelArgs.output  s    7777$$77;;D$GD1772228D824'''(<(<T(BCNNN||It':':DAArV   c                   |t         j                  j                  v r)t         j                  j                  j                  |       || j                  vsJ |       || j                  v rL| j                  |   }t        |t              rJ |j                  j                  |       || j                  |<   y | j                  j                         D cg c]  }t        |t              s| }}| j                  j                         D cg c]  }t        |t              r| }}t        t        |            t        |      z   }t        d| ||g      }|| j                  |<   || j                  |<   y c c}w c c}w )N
in_out_ptr)r7   r   unaligned_buffersrK  r  r3  r  r  appendr  r  r0   r  )rh   
input_nameoutput_namebufvalalive_buffersr  inplace_buffer_idxs           rT   make_inplacezKernelArgs.make_inplace  sk   222GG%%))+6$"6"66CC6---&&z2C!#z222OO"";/03D  -  //668!#z2 M   //668c:. O 
 "%VM%:!;c/>R!R /01[)C 03D  ,03D  -!
s   E3E8c                x   t        |t        j                  |      t        j                  j                         t         j                         |      }t        | j                        D ]  \  }}t         j                  ||      rJ|j                  }t         j                  ||      | j                  |<   |j                  |j                  |fc S |j                  |j                  k7  r|j                  |j                  k7  rJ |        | j                  j                  |       |j                  |j                  dfS )aZ  
        Allocate or extend a workspace buffer of nelem elements.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
        a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
        this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nelem (sympy.Expr): The number of elements to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.
            dtype (torch.dtype): the dtype of the workspace tensor

        Returns:
            Tuple[str, str, int]: A tuple containing:
                - "ws_ptr": A string identifier for the workspace pointer.
                - "workspace_{i}": agraph level unique identifier for
                    the workspace tensor.
                - offset: An integer representing the item offset in the workspace.
        )r   r   r   r   r   r   )r   r   r   r7   r   get_current_device_or_throwr   ru  r  r   r   r   r   r   r  )rh   nelemr   r   argrw  existing_argr   s           rT   r,  zKernelArgs.workspace*  s   8 '11)<77668#//1
  ))<)<= 	OA|$$\37%++)5):):<)M##A&#..0G0GOO''3>>9 ++s~~= >	 	""3'~~s~~q00rV   c           
        t         j                  j                         }t        |t        j
                  t        j                  dd|j                   d|j                   |      }| j                  D ]*  }|j                  |j                  k(  s||k(  r#J ||f        | j                  j                  |       |j                  S )a  
        Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
        all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        sem_ptrsemaphores_r  )r   r   r   r   r   r   )r7   r   r  r   r   r   r   uint32rI  r  r  r   r  )rh   min_sizecurrent_devicer  r  s        rT   
semaphoreszKernelArgs.semaphoresY  s     <<>'66,, $^%8%8$9>;O;O:PQ!
 !// 	@L&&#..8l*?S,,??*	@ 	""3'~~rV   c                f   t        |t              sJ t        |      |f       t        j                  |      }|| j
                  v r| j
                  |   S | j
                  j                         v r0 t        fd| j
                  j                         D               | j
                  |<   S )Nc              3  F   K   | ]  }|j                        sd   yw)rH   N)r   )r  r  re   s     rT   r  z)KernelArgs.seed_offset.<locals>.<genexpr>}  s     U1!,,tBTQUs   !!)r3  r   rI  r   r  r  r  sum)rh   re   rg   s    ` rT   seed_offsetzKernelArgs.seed_offsetu  s    %%;UU';;%e$DMM!==''4==''))&U(<(<(>UUVW   $erV   c                    t        |t        j                        sJ t        |      |f       |j                  dk(  rd| j
                  |<   y| j                  d| j
                  |      S )Nr  ks)r3  r   SymbolrI  re   r  r  r  s     rT   r   zKernelArgs.size  sX    $-AT
D/AA-99"(DMM$||D$--66rV   c                    t        | j                  j                         | j                  j                         | j                  j                               S rl   )r   r  keysr  r  rm   s    rT   
call_nameszKernelArgs.call_names  sA    ##%t':':'?'?'A4==CUCUCW
 	
rV   c                   | j                   j                  |d      }|t        |t              s|j                  S | j
                  j                  |d      }|t        |t              s|S | j                  j                  |d      S )z;
        Returns inner name of a given outer name.
        N)r  rX  r3  r  r   r  r  )rh   re   inplacedr  s       rT   arg_namezKernelArgs.arg_name  s}     ''++D$7
8Z(H&&&))--dD9":k:+N!!%%dD11rV   c                    |S rl   r   )rh   r  r   s      rT   wrap_ptr_argzKernelArgs.wrap_ptr_arg  s    
rV   c                    t        |      S rl   r  )rh   r   s     rT   wrap_size_argzKernelArgs.wrap_size_arg  s    4yrV   Nc                   ddl m} |ddl m} |}g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  d   }|j                  }	t        j                  j                  |      }
||
   }|j                  | d|	        |j                  | j                  ||
             |j                  | d        | j                  j!                         D ]  \  }}	|| j                  v rt        j                  j                  |      }
||
   }|j                  d| d|	        |j                  | j                  ||
             |j                  d| d        | j"                  j!                         D ]  \  }}|| j                  v st        |t              r%t        j                  j                  |      }
||
   }|j                  | d|        |j                  | j                  ||
             |j                  | d        | j$                  j!                         D ]  \  }}	t        |t&        j(                        r@t+        |t,        j.                        r&|j                  d|	        |j                  d	       n+|j                  d| d
|	        |j                  d|        |j                  | j1                  |             t        j                  j2                  st        j                  j2                  j5                  |        | j6                  rJ d       |||fS )NrH   )
INDEX_TYPE)r  r  z* *zconst zconst float zconst float zWorkspace not supported on CPU )r  r  r  r0   r  r  r3  r  r  r   r7   r   r   r  r  r  r  r  r  r   r  r   r   UNBACKED_FLOATr  wrapper_codeensure_size_computedr  )rh   dtype_to_cpp_typer  r  	call_argsarg_defs	arg_typesr  outerinnerr   	cpp_dtypemaybe_inners                rT   cpp_argdefszKernelArgs.cpp_argdefs  s    	*$/ ,		t33::<= 		.H(J/((,E''EGG%%e,E)%0IOOykE734T..ue<=	{!_-		. !..446 	4LE5,,,GG%%e,E)%0IOOfYKr%9:T..ue<=vi[23	4 #'"5"5";";"= 	.E;,,,
;
0SGG%%e,E)%0IOOykK=9:T..ue<=	{!_-	. !MM//1 	ALE5%.>++4 ,ug 67  /&AeW =>  6*!67T//67ww##$$99%@	A &&I(II&I--rV   c                   g }g }g }g }t        | j                  j                               D ]  }t        |t              r|j                  t        |j                               |j                  |j                  d          |j                  t        j                  j                  |j                  d                |j                  t        |j                  |j                  d   t        j                  j                  |j                  d                       t        | j                  j                         | j                   j                               D ]  \  }}|| j                  v st        |t              r%|j                  t        |             |j                  |       |j                  t        j                  j                  |             |j                  t        ||t        j                  j                  |                    | j"                  j                         D ]  \  }}|j                  t        |             |j                  |       |j                  t%        |             |j                  t'        ||             t        j                  j(                  st        j                  j(                  j+                  |        | j,                  D ]m  }|j                  t        |j                               |j                  |j.                         |j                  |       |j                  |j0                         o ||||fS )Nr  )re   r   r   )r0   r  r  r3  r  r  r  r   r  r7   r   r   r   r   r  r  r  r  rI  r   r  r  r  r   r   )	rh   r  r  r  precompile_argsr  r   r  r  s	            rT   python_argdefszKernelArgs.python_argdefs  s    #%!	!	/1t33::<= 	H(J/OOGH$7$789X11"56QWW..x/C/CB/GHI""!,,#//3''++H,@,@,DE	 "$$&%%'
 	LE5
 ,,,
5*0MOOGEN+U#QWW..u56"" ''++E2	" !MM//1 	ALE5OOGEN+U#T%[)""75%#89ww##$$99%@	A && 	(COOGCNN34S^^,""3'SYY'		(
 OY>>rV   c              #    K   t        | j                  j                               D ]  }t        |t              r|j
                  D ]  }|t        j                  j                  v s|t        j                  j                  v r<|| j                  v r| j                  |   |j                  f || j                  v svt        t        | j                  |         |j                  f   y wrl   )r0   r  r  r3  r  r  r7   r   r  r  r  r   r  r
   ry   )rh   r  others      rT   aliaseszKernelArgs.aliases	  s     t33::<= 	UH(J/!-- 	UQWW777 ; ;;D...,,U3X5H5HHHD///sD$7$7$>?ATATTT	U	Us   B9C,<0C,c                    t        | j                  j                  |t              t              xr. t        | j
                  j                  |t              t              S rl   )r3  r  rX  r  r  r  r  s     rT   
is_removedzKernelArgs.is_removed  sK    ##D'2J
 N--11$@*M	NrV   c                l   t               }t        | j                  j                               D ]1  }t	        |t
              r|j                  |j                  d          3 | j                  j                         D ]5  \  }}|| j                  v st	        |t
              r%|j                  |       7 |S )Nr  )
r   r0   r  r  r3  r  rK  r  r  r  )rh   	live_outsr  r   r  s        rT   live_output_bufferszKernelArgs.live_output_buffers   s    %/\	t33::<= 	4H(J/MM(..r23	4 !//557 	!LE5,,,
5*0MMM% 	! rV   )r   ry   r  z6Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]]re   rK   rv   ry   ru   rx   )re   r	   rv   r   r.  )r  ry   r  ry   rv   rw   )r  r   r   r   r   r   rv   ztuple[str, str, int])r  r   rv   ry   )re   ry   rg   r   rv   ry   )re   r  rv   ry   )rv   zIterator[str])re   ry   rv   r   )r  ry   r   r   rv   ry   )r   
SymbolLikerv   ry   rl   )r  z Optional[dict[torch.dtype, str]]rv   z&tuple[list[str], list[str], list[str]])rv   z?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]])rv   zIterator[tuple[str, str]]r  )rv   zOrderedSet[str])r{   r|   r}   r   r  r   r  r  r  r  r  r   r   r,  r  r  r   r  r  r  r  r  r  r
  r  r  r   rV   rT   r  r    s    		E	 	 
		 	5
 , ,
@B4: HM{{-1-1,0-19D-1	-1^87


2 EI4.!A4.	/4.l1?	H1?fUN
rV   r  c                  `     e Zd ZdZ	 	 d	 	 	 	 	 	 	 d	 fdZd
dZddZddZddZd
dZ	 xZ
S )r}  aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    c                    t         |           t        |t              sJ t	        |             || _        || _        d| _        || _        || _	        y r-  )
r  r   r3  r   rI  re   bounds	use_countr   r  )rh   re   r  r   r  r  s        rT   r   zCSEVariable.__init__4  sL     	&+.<V<.	

rV   c                    | j                   S rl   r  rm   s    rT   r  zCSEVariable.__str__C  s    yyrV   c                ,    t        | j                        S rl   )hashre   rm   s    rT   __hash__zCSEVariable.__hash__F  s    DIIrV   c                X    t        |t              xr |j                  | j                  k(  S rl   )r3  r}  re   )rh   r	  s     rT   __eq__zCSEVariable.__eq__I  s!    %-I%**		2IIrV   c                     y rl   r   )rh   re   rq   r  s       rT   update_on_argszCSEVariable.update_on_argsL  s    rV   c                N    | j                   j                   d| j                  dS r  )r  r{   re   rm   s    rT   r  zCSEVariable.__repr__O  s$    ..))*!DII=::rV   r  )re   ry   r  ValueRanges[Any]r   r   r  rG   rx   )rv   r   )r	  objectrv   r   )re   ry   rq   r	   r  r	   rv   rw   )r{   r|   r}   r~   r   r  r  r  r  r  r&  r'  s   @rT   r}  r}  -  sU     (, $ ! %	
 J;rV   r}  AugmentedKeyT)default)boundr!  .c                  P   e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 ddZddZddZddZddZddZ	dd	Z
dd
ZddZ ej                         ddddd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 ddZ ej                         ddf	 	 	 	 	 	 	 	 	 ddZy)CSEz Common subexpression eliminationNc                    || _         || _        i | _        || _        |xs i | _        |xs i | _        |xs t        j                         | _        t               | _
        |xs i | _        y rl   )r   r]   _cachename_prefixstore_cachereduction_cache	itertoolsr   iter_buffer_idsr   invalidated_storesvarname_map)rh   r   r]   r'  iter_buffersr(  r)  r-  s           rT   r   zCSE.__init__a  sm     FH&ALARPR!r 	 6B5VY__EV3=<7B7HbrV   c                6   g | j                   j                         D ]2  \  }}||vs| j                   |= | j                  j                  |       4 |r9| j                  j                         D ci c]  \  }}||v s|| c}}| _        y i | _        y c c}}w rl   )r(  r  r,  rK  r&  )rh   	keep_varsre   tmpr  r  s         rT   
invalidatezCSE.invalidatey  s    44++1134 	2ID#)#$$T*''++D1	2 ,0KK,=,=,?RDAq1	>1a4RDKDK Ss   1B>Bc           	          t        |       | j                  | j                  | j                  | j                  | j
                  | j                  | j                        S )N)r   r]   r'  r.  r(  r-  r)  )rI  r   r]   r'  r+  r(  r-  r)  rm   s    rT   clonez	CSE.clone  sP    tDz;;;;((--(((( 00
 	
rV   c                    | j                         }t        | j                        |_        t        | j                        |_        t        | j                        |_        |S )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)r4  r+   r&  r)  r(  )rh   new_cses     rT   scoped_copyzCSE.scoped_copy  sH    **,#DKK0",T-A-A"B()9)9:rV   c                "    t        t        |      S )z@Override this method to augment cache key with backend specifics)r
   r   rh   	cache_keys     rT   augment_keyzCSE.augment_key  s    M9--rV   c                @    || j                   | j                  |      <   y rl   r&  r;  )rh   r:  r  s      rT   putzCSE.put  s    36D$$Y/0rV   c                <    | j                  |      | j                  v S rl   )r;  r&  r9  s     rT   containszCSE.contains  s    	*dkk99rV   c                X    | j                   j                  | j                  |      d       S rl   )r&  rX  r;  r9  s     rT   try_getzCSE.try_get  s"    {{t//	:DAArV   c                >    | j                   | j                  |         S rl   r=  r9  s     rT   rX  zCSE.get  s    {{4++I677rV   T)r  rf   
assignmentr   r  c          	        t        |t              r|j                  }|s|sJ t        |t              rE|j                  j                  |      |_        |xj                  dz  c_        t        t        |      S t        |t              r|j                         }n1t        |t              r|j                  }nt        |t              sJ |}| j                  |      }	||sd}|	s| j                  |||      }	| j!                  ||	       |rt"        j$                  j&                  r+t"        j$                  j&                  j)                  |d       t        |t              rP|r |j+                  | j,                   |	 d       |j/                  |       |j+                  | j0                         |	S t        |t              rM|sJ |j+                  |j3                  | j,                   |	 d|j                   | j0                                |	S |r | j,                   |	 d| | j0                   }
n| | j0                   }
|j+                  |
       |rPt4        j6                  j8                  st4        j6                  j:                  r|t=               dk7  rt?        ||	|       |	S |	j                  j                  |      |	_        |	xj                  dz  c_        |	S )NrH   r   T)	only_oncez =z = rj  ) r3  r4   rg   r}  r  tightenr  r
   r  r)   getvaluer&   r  ry   rB  newvarr>  r7   r  current_nodecodegen_originating_infor  r   splicer]   r  r   r  r  r  r(   r  )rh   r   r   r  rf   rD  r   r  r:  r  r  s              rT   generatezCSE.generate  sl    dH%::D
""dK( ++--f5DKNNaN..n-I./		IdC(((Ill9%= E++feU3CHHY$88((HH))BB$ C  dN3!((DKK=R)@AMM$'$$T[[1: 
9  &67%%:$$$++se3tyyk$++'WX4 
- ""&++se3tfT[[MJ"&}5$$T* #"//KK%22JJ!-/1U:#FC7 
 ++F3CJMMQM
rV   c                    | j                    t        | j                         }t        j                  j                  ||||      }|| j                  |<   |S rl   )r'  r   r+  r7   r  create_cse_varr-  )rh   r  r   r  var_namer  s         rT   rI  z
CSE.newvar  sT     &&'T-A-A(B'CDhh%%huE%("
rV   c                    t        j                  | j                  vfd       t        j                  j                  |||      }|| j                  <   |S )Nc                     d  S )Nzduplicate name: r   r  s   rT   rq  zCSE.namedvar.<locals>.<lambda>  s    4DTF2K rV   )r   _check_valuer-  r7   r  rO  )rh   re   r  r   r  r  s    `    rT   namedvarzCSE.namedvar  sU     	(((*K	
 hh%%dFE5A!$
rV   )r  r  r1  NNNN)r   ry   r]   ry   r'  ry   r.  zOptional[itertools.count[int]]r(  z.Optional[MutableMapping[str, CSEVariableType]]r)  z<Optional[MutableMapping[ReductionCacheKey, CSEVariableType]]r-  z$Optional[dict[str, CSEVariableType]])r0  zOrderedSet[CSEVariable]rv   rw   rv   r   )r:  ry   rv   r   )r:  ry   r  r  rv   rw   )r:  ry   rv   r   )r:  ry   rv   zOptional[CSEVariableType])r:  ry   rv   r  )r   r)   r   zCUnion[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase]r  r  rf   r   rD  r   r   r   r  rG   rv   r  )r  r  r   r   r  rG   rv   r  )
re   ry   r  r  r   r   r  rG   rv   r  )r{   r|   r}   r~   r   r2  r4  r7  r;  r>  r@  rB  rX  r   unknownrM  rI  rT  r   rV   rT   r$  r$  ^  s   *  7;FJ <@II I 	I
 5I DI
I :I0	
.7:B8 $7;#6#6#8'+ $KK RK
 !K K K %K K 
K^ $7;#6#6#8'+ $		 	 %	 		
 
	 $7;#6#6#8'+ $ ! %	
  
rV   r$  c                  0     e Zd Zd fdZddZddZ xZS )CodeGenc                T    t         |           t        j                         | _        y rl   )r  r   r  	ExitStack
exit_stackrh   r  s    rT   r   zCodeGen.__init__  s    $..0rV   c                :    | j                   j                          | S rl   )r[  	__enter__rm   s    rT   r^  zCodeGen.__enter__  s    !!#rV   c                >    | j                   j                  |||       y rl   )r[  __exit__)rh   exc_typeexc_valexc_tbs       rT   r`  zCodeGen.__exit__  s      7F;rV   ru   rU  ra  r	   rb  r	   rc  r	   rv   rw   )r{   r|   r}   r   r^  r`  r&  r'  s   @rT   rX  rX    s    1<rV   rX  c                  <    e Zd ZU dZded<   dZded<   dZded<   	 d"	 	 	 	 	 d# fdZej                  d$d	       Z
ej                  	 	 d%	 	 	 	 	 	 	 d&d
       Zd'dZd'dZd(dZ	 d)	 	 	 	 	 	 	 	 	 d*dZd+dZ	 	 	 	 	 	 	 	 	 	 d,dZ	 	 	 	 	 	 	 	 	 	 d-dZ	 	 	 	 	 	 	 	 d.dZ	 	 	 	 	 	 	 	 	 	 d/dZd0dZ	 	 d%	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d1dZed2d       Z	 d)	 	 	 	 	 	 	 	 	 d3dZ	 	 	 	 	 	 	 	 	 	 d4dZd5dZd6 fdZd7 fdZd8dZd9dZd9dZ 	 	 	 	 d:dZ!d;d Z"d<d!Z# xZ$S )=Kernelr  ry   newvar_prefixr]   Nz'Optional[Callable[[], OpsHandler[Any]]]	overridesc                4   t         |           |rt        xj                  dz  c_        |xs
 t	               | _        t               | _        t               | _        t               | _	        d| _
        d| _        d| _        d| _        t        | j                  | j                         | _        t%               | _        t%               | _        d | _        d | _        d | _        d | _        t%               | _        t%               | _        i | _        d| _        d | _        y )NrH   Fr   )r  r   r    generated_kernel_countr  rq   r)   loadscomputestoresatomic_add_foundnum_load	num_storenum_reductionr$  rg  r]   cser   must_keep_buffersstore_buffer_names
_load_mask_load_otherrJ  node_to_boundsr  r  inplace_update_buffersmin_elem_per_threadkernel_name)rh   rq   increase_kernel_countr  s      rT   r   zKernel.__init__  s     	 **a/*(JL	#%
%'$& %.1$2D2Ddkk.R2<,3=<)-4859OS0:3=<
 79##$ *.rV   c              #     K   | j                   }|| _         |j                  j                         j                         | _        	 d  || _         y # || _         w xY wwrl   )rJ  r  r  
get_boundsrw  )rh   r  priors      rT   set_current_nodezKernel.set_current_nodeD  sO     !! "jj//1<<>	& %DDs   AAA A	AAc              #    K   ||}|d u x}r
t               }| j                  }| j                  }| j                  }| j                  }|| _        || _        || _        |j                         | _        	 d  || _        || _        || _        || _        |r
|rJ d       y y # || _        || _        || _        || _        |r
|rJ d       w w xY ww)Nz$unexpected store inside swap_buffers)r)   rk  rl  rm  rr  r7  )	rh   lbcbsbdisallow_storesrk  rl  rm  rr  s	            rT   swap_bufferszKernel.swap_buffersN  s      :B Dj(?(!B

,,hh
??$		FDJ"DL DKDHEEEv2  DJ"DL DKDHEEEv2 s   A/C2B 6)C*C		Cc                    t         rl   r  r  s      rT   r  zKernel.loadl  r	  rV   c                    | j                   }	 | j                  | _         | j                  ||      || _         S # || _         w xY w)z+A load the depends on an index we have read)rk  rl  r  )rh   re   r  r~  s       rT   indirect_loadzKernel.indirect_loado  s8    

	DJ99T5)DJDJs	   "8 	Ac                    t         rl   r  r  s       rT   r  zKernel.store_reductiony  r	  rV   c                    t         rl   r  r  s        rT   r  zKernel.store|  
     "!rV   c                D    t        t        |       j                   d      r  r  r  s      rT   r  zKernel.device_assert_async  r  rV   c                    t         rl   r  r  s        rT   r  zKernel.reduction  
     "!rV   c                    t         rl   r  )rh   re   r  rg   
extra_metas        rT   partial_accumulatezKernel.partial_accumulate  r  rV   c                    t         rl   r  r  s       rT   r  zKernel.scan  s
     "!rV   c                    t         rl   r  r  s        rT   r  zKernel.sort  r  rV   c                    t         rl   r  rm   s    rT   
var_rangeszKernel.var_ranges  r	  rV   c                    t         )z3
        See [Note: Inductor bucketize op]
        r  r  s           rT   r  zKernel.bucketize  s
     "!rV   c                    t         rl   r  rm   s    rT   assert_functionzKernel.assert_function  s    !!rV   c           	     v   t        |t              rt        |      }t        |t              sJ t        |             |t        |t              sJ |t        |t              sJ |r|rd| d| d| d| d	}| d| d| }n|r
| d| }|}n|sJ | d| }|}|r	d| d| d}| j                   d| d| dS )	Nr  z <= r  z < r   z) | ~(z, "index out of bounds: z"))r3  r}  ry   rI  r  )rh   r  r  r  maskrk  
cond_prints          rT   indirect_assertzKernel.indirect_assert  s    c;'c(C#s#.T#Y.#}
5# 666}
5# 666U ugT#eC5E7!<D!7$se3ug6JWD&DJL5U#eW%DJtfF4&*D&&'q.FzlRTUUrV   c                    t         rl   r  r  s        rT   r  zKernel.check_bounds  r  rV   c                    t         rl   r  r  s     rT   index_to_strzKernel.index_to_str  r	  rV   c           	     (   t         |           | j                  sJ | j                  j	                  t        j                  t        | | j                                            | j                  j	                  t        j                  |              | S rl   )	r  r^  rh  r[  enter_contextr7   set_ops_handlerCSEProxyset_kernel_handlerr\  s    rT   r^  zKernel.__enter__  sl    ~~~%%htT^^-=>?	
 	%%a&:&:4&@ArV   c                H    | j                          t        | 	  |||       y rl   )remove_kernel_local_buffersr  r`  )rh   ra  rb  rc  r  s       rT   r`  zKernel.__exit__  s     ((*7F3rV   c                   t         j                  j                  syt        fd| j                  D              }t               | j                  D ]c  }|| j
                  vs|| j                  j                  vs+j                  ||      s>| xj                  dz  c_	        j                  |       e D ]  }|| j                  j                  v rw| j                  j                  |   }t        |t              rEt        fd|j                  D              }|r| j!                  |       | j"                  j                  |       | j%                  |        y)z
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        Nc              3  t   K   | ]/  }|j                   v rj                   |   j                          1 y wrl   )name_to_bufdefining_op_name)r  r  r  s     rT   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>  s;      &
i+++ !!#&779&
s   58rH   c              3  &   K   | ]  }|v  
 y wrl   r   )r  r  names_to_removes     rT   r  z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>	  s     KaQ/1Kr  )r7   r   r  r   rt  rs  rq   r  $can_buffer_be_removed_through_fusionrp  rK  r  r3  r  r  r  remove_inplace_bufferr  remove_buffer)rh   fused_node_namesre   r  rd   r  r  s        @@rT   r  z"Kernel.remove_kernel_local_buffers  s3    GG%%	% &
..&
 

 ,6<++ 		*DD222		 7 77BB* !###D)		* $ 
	)Dtyy000ii//5c:.K3??KK..t4''++D1""4(
	)rV   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremove_buffer(%r))rC  rR   r  rq   r  r  rK  r  s     rT   r  zKernel.remove_buffer	  s;     			%t,)0		  &  &rV   c                    t         j                  d|       t        | j                  j                  |<   | j
                  j                  |       y )Nzremoving_inplace_buffer(%r))rC  rR   r  rq   r  r  rK  r  s     rT   r  zKernel.remove_inplace_buffer$	  s9    		/6*1		!!$'  &rV   c           
        t        |t        t        f      r|D cg c]  }| j                  |       c}S t        j
                  j                  j                  |      }t        |j                  d       }|D ci c]f  }t        |t        j                  t        j                  t        j                  t        j                  f      r|| j                   j#                  |      h }}t%        ||      S c c}w c c}w )Nc                    | j                   S rl   r  )ss    rT   rq  z(Kernel.rename_indexing.<locals>.<lambda>1	  s
    !&& rV   )r  )r3  listtuplerename_indexingr7   r   r  r  sortedfree_symbolsr   r   UNBACKED_INTSIZEPRECOMPUTED_SIZEr  rq   r   r.   )rh   r  r1  sorted_symbolsreplacementss        rT   r  zKernel.rename_indexing)	  s    
 edE]+5:;D((+;;  ))%0 2 28HI $
%%II))''	 tyy~~a  
 
 %..! <
s   C4;A+C9c                    t        |i |S rl   )r}  )rh   rq   r  s      rT   rO  zKernel.create_cse_varA	  s    D+F++rV   c                Z    |y| j                   j                  |j                               S )zC
        Returns arg name of a given input or output node.
        N)rq   r  r   )rh   r  s     rT   r  zKernel.arg_nameD	  s'     <yy!!$--/22rV   )NT)rq   zOptional[KernelArgs]r{  r   rv   rw   )r  rF   rv   r  r  )r  r)   r  Optional[IndentedBuffer]r  r  rv   r  re   ry   r  r   rv   r}  re   ry   r  r   rg   r}  rv   rw   rl   
re   ry   r  r   rg   r}  r\   r6   rv   rw   r  
r   r   r  r   r  r5   rg   +Union[CSEVariable, tuple[CSEVariable, ...]]rv   r  )
re   ry   r  r5   rg   r}  r  dict[str, Any]rv   rw   r  r  r  zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]rv   r  
r  r  r  r  r  r   r  r   rv   r  )rv   zdict[sympy.Symbol, sympy.Expr]r  r}  r  r  r  r}  r  r   r  r   r  r  r  zOptional[CSEVariable]rv   r}  rx   )
r  zUnion[CSEVariable, str]r  r   r  r   r  z!Optional[Union[CSEVariable, str]]rv   ry   r  )r  r   rv   ry   rU  rd  ru   )re   ry   rv   rw   )r  z;Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]rv   r   )rq   r	   r  r	   rv   r}  )r  rA   rv   r   )%r{   r|   r}   rg  r   r]   rh  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r^  r`  r  r  r  r  rO  r  r&  r'  s   @rT   rf  rf    s   M3FC9=I6= PT#/(#/HL#/	#/J & &  (,'+	FF %F %	F
 
F F:"" SW"" *"3>"FO"	"


"" " &	"
 ;" 
5""" &" 	"
 #" 
""'"
" (" 
!""'" (" 	"
 " 
!"" 4804"" C" &	"
 $" " 1" ." 
" " " 37V$V V 	V
 0V 
V<""&0"9="FJ"	"
"4&)P''
/P/	/0,3rV   rf  c                  8    e Zd ZU dZded<   dZded<   dZded	<   y)
r  r  zClassVar[str]r  Nr   r   r  ry   ops_name)r{   r|   r}   r  r   r   r  r   rV   rT   r  r  M	  s!    "C"#'E 'HcrV   r  c                 b    	 dd l } | j                  | j                        S # t        $ r Y y w xY w)Nr   )	undefined)jinja2EnvironmentStrictUndefinedImportError)r  s    rT   
jinja2_envr  U	  s?    !!,, " 
 	
  s   " 	..c                      e Zd ZdZe	 d	 	 	 	 	 	 	 dd       Zedd       Ze	 	 	 	 dd       ZdddZe	dd       Z
e	dd       Zdd	Z	 	 	 	 	 	 dd
ZddZy)KernelTemplatezg
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
    c                    | j                  d      }t        |      dkD  r|dd  D cg c]  }d|z  |z  |z    c}|dd  dj                  |      S c c}w )NTrH   r  r  )
splitlinesr  r   )sourcenum_indentsindents_spacinglinesr  s        rT   indent_except_firstz"KernelTemplate.indent_except_firsth	  sd     !!$'u:>INqrAE&4<E!"I wwu~s   Ac                    t               }|y t        j                  |j                  d<   ddlm} 	 |j                  |       S # |$ r} G d d|      } ||      |d }~ww xY w)Nr  r   )TemplateSyntaxErrorc                  (     e Zd Zd fdZddZ xZS )IKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrorc                    t         |   |j                  |j                  |j                  |j
                         || _        y rl   )r  r   messagelinenore   filenameoriginal_error)rh   r  r  s     rT   r   zRKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__	  s>    G$&..&--&++&// +9D'rV   c                F   d| j                    d}|d| j                   dz  }t        | j                  d      r| j                  j                  j                  d      }|dz  }t        d| j                   dz
        }t        t        |      | j                   dz         }t        ||      D ]s  }|| j                   dz
  k(  rN||dz    d	||    dz  }t        | j                  d
      s=|dd| j                  j                  dz
  z  z   dz   z  }c||dz    d||    dz  }u |S )NzError in template at line 
zError message: r  z	Context:
r   r   rH   z: --> columnz     r  z^
z:     )r  r  r  r  r  splitmaxminr  r  r  )rh   
error_infor  startendrw  s         rT   r  zQKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__	  sA   #=dkk]"!MJODLL>"DDJt22H= $ 3 3 : : @ @ F"l2
 #At{{Q 7!#e*dkkAo>!&uc!2 
KA DKK!O3 *QveAhZr.J J
#*4+>+>#I$.(/*-1D1D1K1Ka1O*P)Q*/)0%&J !+QveAhZr.J J

K &%rV   )r  r  rv   rw   rx   )r{   r|   r}   r   r  r&  r'  s   @rT   DetailedTemplateSyntaxErrorr  	  s    9&rV   r  )r  r  r  filtersr  r  from_string)r  envr  er  s        rT   _template_from_stringz$KernelTemplate._template_from_strings	  sk    l;-;-O-O)*.%	8??6**" #	8&.A &B .a0a7G#	8s   A A!AA!c                   t         j                  j                  t        | t        t
        f      r.| D ci c]!  }|j                         |j                         # c}n | j                         | j                         idfd}|S c c}w )Nc                >    j                  |       }||S  |       S rl   )rX  )re   r  _get_dtype_reallookups     rT   r   z1KernelTemplate._fake_get_dtype.<locals>.get_dtype	  s'    ZZ%F!"4((rV   )re   ry   rv   r   )r7   r   r   r3  r  r  r   )	fake_outsr  r   r  r  s      @@rT   _fake_get_dtypezKernelTemplate._fake_get_dtype	  sr     ''++i$/AJK#cllncmmo5KF((*I,?,?,ABF	)  Ls   &B
Nc                     || _         || _        y rl   )re   _hash)rh   re   r  s      rT   r   zKernelTemplate.__init__	  s    	
rV   c                    | j                   S )a  
        entry point to override for templates to ensure a uid e.g. through a prefix

        the purpose of this is that every KernelTemplate/ExternKernelChoice is unique
        in the system, but reproducible e.g. restarting pytorch should yield the same id
        r  rm   s    rT   uidzKernelTemplate.uid	  s     yyrV   c                    | j                   S )a  
        source hash for a Template.

        Templates can optionally provide a src hash to make it easier to cache/validate that
        a template has not changed from one version to another. Override this if that detection
        is different for your specific Template
        )r  rm   s    rT   src_hashzKernelTemplate.src_hash	  s     zzrV   c                X    g } | j                   |fi |}|t        |      dk(  r|d   S y)z
        Maybe generates a new ChoiceCaller and returns it, or None if generation fails.

        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        NrH   r   )maybe_append_choicer  )rh   r  temp_choicesr  s       rT   choice_or_nonezKernelTemplate.choice_or_none	  s>     #%))),A&A>c,/14?"rV   c                   	 |j                   | j                  di |       y# t        $ rQ}t        j	                  d|t        |       t        j                         t        j                  k         |cY d}~S d}~ww xY w)a%  
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_infor   )	r  rM  r   rC  inforI  getEffectiveLevelrP   INFO)rh   choicesr  r  s       rT   r  z"KernelTemplate.maybe_append_choice	  sn    
	NN=4==2623" 	HHET
002W\\A	   H	s   !$ 	A>AA93A>9A>c                    t         )zM
        Generates a ChoiceCaller instance from the given arguments.
        r  )rh   r  s     rT   rM  zKernelTemplate.generate	  s
    
 "!rV   )   )r  ry   r  r   r  r   rv   ry   )r  ry   rv   r	   )r  zUnion[list[Buffer], Buffer]rv   zCallable[[str], torch.dtype]rl   )re   ry   r  r   rv   rw   rx   )rv   zUnion[str, None])r  r	   rv   zOptional[ChoiceCaller])r  rz   r  r	   rv   zOptional[NotImplementedError])r  r	   rv   r?   )r{   r|   r}   r~   r   r  r  r  r   r   r   r  r  r  rM  r   rV   rT   r  r  a	  s     >?"%8;	  ,8 ,8\ .	% "    
 ,/	&."rV   r  c                  6    e Zd ZdZd Zd fdZddZddZ	 	 d	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 ddZ	ddZ
ddZ	 d	 	 	 	 	 	 	 	 	 dd	Zdd
ZddZddZ	 	 	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 	 	 d dZ	 	 d!	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dZ xZS )#r  z~A ops handler that proxies calls to `kernel` and its
    handler and returns `CSEVariable`s with correct shape and dtype.
    c                b    t         |           ddlm}  |       | _        || _        || _        y )Nr   ValueRangeAnalysis)r  r   r  r  vr_analysisr  parent_handler)rh   r  r  r  r  s       rT   r   zCSEProxy.__init__ 
  s+    /-/,rV   c           	     d  
  | j                   gi  t        | j                        i }t               }t	               }t               
t        |      }d d dk(  r
dk(  r|j                  |j                  nydk(  rT
dk(  rOt        j                  j                  j                  j                  t        j                  d       j                  d n 
dv rt        |      } |i  |i 
dv rJ dd
fd}	t        j                   |	|      S )	Nmaskedrl  rj  )rl  rj  rw  )rl  rj  r   c                   t        	t        t        f      r	
   n	}t        t        t        f      r,t              dkD  rt        d   t        t        f      r
   n}
dz  
t        | t              r+dk(  r| j
                  || _        | j                  || _        t        j                  j                  j                  t        j                  j                  | 	      }|j                         t        j                  j                  st        j                  j                   r)|J t#        t        j                  j                  ||       t        j                  j$                  r)J t'        t        j                  j                  |       t        j(                  r$t+        t        j                  j                  |       |S )Nr   rH   rj  r  r   r  )r3  r  r  r  r}  r   r  r7   r  rr  rM  rl  r  r   r  r  r  r  r  r  runtime_triton_nan_assertsr  )r  	var_dtype	var_shapecsevarrq   r  r  r  re   r  
output_idxoutput_shapes       rT   do_csez!CSEProxy._default.<locals>.do_cse)
  s   
 lT5M: Z(!  lT5M:%)|Au> Z( "  !OJ ![)e#'AG77?'AGXX\\**  "" + F !!$f5 ##??&&>> ,,,AHH,,fi@"">>#///AHH,,flC00!((**F3MrV   )r  zUnion[str, CSEVariable]rv   r}  )_bound_variabler  r  r!   r$   r(   r   r  r7   interpreterrJ  r  rX  r  r  pytreetree_map)rh   re   rq   r  rg   dtype_handlershape_handlershape_opdtype_opr  r  r  r  r  r  s    ```      @@@@@rT   _defaultzCSEProxy._default
  s@   %%%d<T<V<2++T2DCFC2424%'=$/88 3 ;;L ;;LX'U"2==55::>>#''e   L00}d3H#T4V4L#T4V4L''+++
0	 0	d vu--rV   c                  	 ddl m} ddlm} ddlm} t        t        j                  |      rt        j                         S t        t        j                  |      rt        j                         S t        t        j                  t              rt        j                         S t        j                  j                  		j                  |k(  r| j                  j                  t        | j                  j                  t               s$J t#        | j                  j                               | j                  j                  j%                  	t        j                               S t&        j(                  rjt+        ||      r^t-        	fddD              rt        j                         S |rJ d	d}t/        t1        ||            } t3        | j4                  |      | S t        j                         S )
z
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        r   r  )TritonTemplateKernelrH   )CUDATemplateKernelc              3  :   K   | ]  }|j                   v   y wrl   )r  )r  r  fx_nodes     rT   r  z+CSEProxy._bound_variable.<locals>.<genexpr>x
  s     V11&Vs   )set_indirectr  r  c                    t        | t              r| j                  S t        | t        j                        rt        |       S | S rl   )r3  r}  r  r   r  r   r0  s    rT   arg_to_boundz.CSEProxy._bound_variable.<locals>.arg_to_bound
  s2    a-88O5::.&q>)HrV   )r1  r	   rv   r	   )r  r  select_algorithmr*  cuda.cuda_kernelr+  r3  r7   r  r   rV  r!  r1   rJ  r  rw  dictrI  rX  r   compute_all_boundsr  r  r  r  r  r  )
rh   re   rq   r  r  r*  r+  r0  
arg_boundsr-  s
            @rT   r   zCSEProxy._bound_variable]
  sm   
 	0;8ahh 45&&((ahh 23&&((amm[1&&((--,,>>T!dkk&@&@&Ldkk88$? **B ? ;;--11';;N;N;PQQ&&73Et+L V0UVV"**,, : c,56J274++T2J??""$$rV   c                P   t        |t              rt        j                  |      }t        |t        j                        sJ t        |      |f       |j                  j                  dk  r|rt        j                  |t        j                  |t        j                              }|j                  j                  dk\  r0t        j                  |d      }t        j                  |||      }n|}t!        j"                         }|j                  t!        j"                         k7  rt        |t        j$                        r|j                  t!        t&         d      z  }t!        |j                  |z   |j                  |z         }|j                  j                  dk\  r"|j                  t!        dt&              z  }	||	z  }| j(                  j*                  j-                  | j(                  j.                  |||j0                  |j2                        }| j4                  j7                  |||      }
t9        |      ro|j                  j                  dk\   }t        |t        j$                         xs |j                  j                  |k   }| j(                  j;                  |
|||       |
S )Nr   r  r  )r3  r   r   r  r  rI  r  r  r2   rK  r  r   longr  ltri  r   rV  Numberr   r  rr  rM  rl  r   r  r  r  r'   r  )rh   r  r   r  r  stmr8  
new_bounds
neg_boundspos	sympy_varassert_lowerassert_uppers                rT   r  zCSEProxy.indirect_indexing
  s    dC ==&D$

+?d4j$-??+ ::aggc3>>$

#CD::##q(QB))BS1C %,,.Jzz[0022z$7U !ZZ+vgr*BB
($$t+Z-=-=-D
 ::##q(**{1f'==C!+c!1J++//**##!iiii + C ''99#tUK	5! #

 0 0A 56L)$== 

  4'BL KK$$YlLQrV   c                >    | j                   j                  ||||      S rl   )r  r  r  s        rT   r  zCSEProxy.check_bounds
  s     {{''dE5AArV   c                   || j                   j                  j                  v r)t        j                   j                  j                  |       t        |t        j                        r| j                   j                  ||      S | j                   j                  j                  }||v r||   S | j                   j                  ||      }|j                  dk(  r| j                   xj                  dz  c_        |S r-  )r  rr  r,  r7   rs  rK  r   r   TMPr  r(  r  r  ro  )rh   re   r  r(  outs        rT   r  zCSEProxy.load
  s    4;;??555 HH&&**40udhh/;;,,T599kkoo11;t$$kktU+ ==AKK  A% 
rV   c                l   || j                   j                  j                  |<   | j                   j                  r{|t        j
                  j                  v r^| j                   j                  j                  |      }|j                         D ]%  }|| j                   j                  j                  |<   ' y y y rl   )	r  rr  r(  rJ  r7   r   name_to_buffer
get_outputget_mutations)rh   re   rg   r  
other_names        rT   _update_store_cachezCSEProxy._update_store_cache
  s    ,1##D);;##0F0F(F++**55d;C!//1 @
:?++J7@ )G#rV   c                ,   | j                   j                  j                  |       || j                  ||       |t        j
                  j                  vr?| j                   j                  ||||       | j                   xj                  dz  c_        y y )N)r\   rH   )	r  rt  rK  rJ  r7   r   r  r  rp  r  s        rT   r  zCSEProxy.store
  sx     	&&**40<$$T51qww...KKdE5t<KK!!Q&! /rV   c                <    | j                   j                  ||       y rl   )r  r  r  s      rT   r  zCSEProxy.device_assert_async
  s    ''c2rV   c                6     | j                   j                  |  y rl   )r  r  rp   s     rT   r  zCSEProxy.partial_accumulate
  s    &&&-rV   c                "   | j                   j                  j                  |       | j                  ||       |t        j
                  j                  vr<| j                   xj                  dz  c_        | j                   j                  |||      S y r-  )	r  rt  rK  rJ  r7   r   r  rp  r  r  s       rT   r  zCSEProxy.store_reduction
  so    &&**40  u-qww...KK!!Q&!;;..tUEBB /rV   c                |    | j                   xj                  dz  c_        | j                   j                  ||||      S r-  )r  rq  r  r  s        rT   r  zCSEProxy.reduction
  s4     	!!Q&!{{$$UI~uMMrV   c                <    | j                   j                  |||      S rl   )r  r  r  s       rT   r  zCSEProxy.scan  s     {{
F;;rV   c                >    | j                   j                  ||||      S rl   )r  r  r  s        rT   r  zCSEProxy.sort  s     {{
CCrV   c           	     D    | j                   j                  |||||||      S )a  
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values".  Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor.  This must be int64 or int32.  This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries.  0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to.  This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)).  Otherwise, the result is undefined.
        )r  r  r  s           rT   r  zCSEProxy.bucketize  s1    L {{$$
 	
rV   )r  zKernel[Any]r  zOpsHandler[Any])re   ry   rq   ztuple[Any, ...]r  r  rv   r	   )re   ry   rq   r	   r  r	   rv   r  r  )
r  r}  r   r  r  r   r  r   rv   r  r  r  )re   ry   rg   r}  rv   rw   rl   r  r  )rq   r	   rv   rw   r  r  r  r  r  r  )r{   r|   r}   r~   re   r   r(  r   r  r  r  rJ  r  r  r  r  r  r  r  r  r&  r'  s   @rT   r  r  	  s    D-S.j.%h 55 %5 	5
 5 
5nBB&0B9=BFJB	B
"@ SW'' *'3>'FO'	'3.CNN N &	N
 ;N 
5N	<'	<
	< (	< 
!	<D'D (D 	D
 D 
!D  4804N
N
 CN
 &	N

 $N
 N
 1N
 .N
 
N
rV   r  )rS   ry   rv   rw   )NNNN)r   ry   r4  r   r5  r   r6  r   r7  r   r8  Optional[CustomGraphModulePass]r9  Optional[ConfigModule]rv   rw   )r   Union[torch.device, str, None]rv   zOrderedSet[BackendFeature])r   rU  rO  r<  rv   r   )r   ry   rv   zOptional[SchedulingConstructor])FF)r   ry   rS  r   rT  r   rv   r   )r   ry   rv   rS  )r   ry   rv   rT  ru   )r  Sequence[sympy.Expr]r  rV  r  rV  rv   r   )r   ry   r  r  rv   rw   )r   ry   rv   r  )r  ry   rq   r	   r  r	   rv   r   )r   r)   r  r  r   r   rv   rw   )r   r)   r  r  r  rG   rv   rw   )r   r)   r  r  rv   rw   )rv  ry   rv   r   r   r  )rv   r	   )
__future__r   ra   r  dataclassesenumr  r*  rP   rD  r  rc   rer_   abcr   r   r   r   r   typingr	   r
   r   r   r   r   r   r   typing_extensionsr   r   r   r   torch.fxtorch._prims_commonr   torch.utilsr   r"  torch.utils._config_moduler   torch.utils._ordered_setr   torch.utils._sympy.numbersr   torch.utils._sympy.printersr   _PythonPrintertorch.utils._sympy.symbolr   r   r   torch.utils._sympy.value_rangesr   r   r  r   r    dtype_propagationr!   ops_handlerr"   r#   shape_propagationr$   utilsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   virtualizedr1   r2   r3   r4   r5   r6   r7   collections.abcr8   r9   r:   r;   r<   custom_graph_passr=   r   r>   r?   r@   rA   r  rC   r  rD   rE   rF   rG   r  rJ   rK   r   rI  r   ry   r  r  rp  _logginggetArtifactLoggerr{   rN   	getLoggerrC  rU   	dataclassrX   r   r   r   r   r   r   r   r   r   KernelArgTyper  r   r  r/  r0  r1  r:  r<  rK  rP  rJ  rV  rY  r[  cacherH  r  r  r  bfloat16r  float16r   r8  float64int8int16r/  r  r   uint16r  uint64r  r  r  r  r  r  r)  compile
IGNORECASEr~  ry  r{  r  r3  INT_TO_FLOATr  r  r  r  r  r  r  r  r  r}  r   r  r  r   ReductionCacheKeyr$  rX  rf  r  r  r  r  r   s   0rT   <module>r     s.   "          	 	  #  	 	 	 ,    ? ) 3 / - G O O D  : ; :      LL$9>>$DD2-	B$hy&9%:N%JK23sELL()J F~~//*Eg!=
   >/		 /(C  Td= d dN* * # # #       ! ! ! < < < lIw8H,VW,.) .5" 5"p :< 6 ;DF A FDF  A F8 @D>B:>37BB,B /B !=	B
  <B 8B 1B 
B4
&T 
&3*33$3*35C3	3U
 @E"8<!-6 c cLUU$U  U 	U;;&7;	;	, 
NNEKK	MM5;;> JJMMMMJJKKKKKKKKLLLLLL
 	u> : ,''' ' 	'T::!0:9D:	:2	L	L!0	L9G	L		L
aB aBH=N =$S1 S1l "rzz";2==Q O;#%5z# O;d - - -  6: `6;HH-`6 ;HH/- 	`6 ;HH/- 	`6$ ;HH/- 	%`60 ;HH/- 	1`6< ;HH*)	=`6L ;HH(0	M`6X 	;HH66>	Y`6h ;HH1i`6r ;HH2s`6| ;HH1}`6F ;HH2 G`6P ;HH%8$Q`6^ 	;HH&%		_`6j ;HH%8	k`6v 	;HH&	w`6@ ;HH+A`6L %;HH88)	M`6X %;HH88)	Y`6d %;HH8)e`6n %;HH8)o`6z 
;HH'
{`6D ;HH(E`6N ;HHc	O`6^ ,;HH?0_`6h ,;HH?0i`6t &;HH9*u`6~ 
;HH*
`6H );HHD-I`6R );HHD-S`6\ );HHD-]`6f );HHD-g`6p (;HHC,q`6z $1;HHL5${`6D $1;HHL5$E`6N $1;HHL5$O`6X $1;HHL5$Y`6b ';HHB+c`6l (;HHC,m`6v (;HHC,w`6 2 `F	-# -"> *Z 
 N N N 
 ,X Xv
#; #;L 5+;Tk5c!1223	5l'/=0
1 l^
< 
<p3Wgo. p3f	     U" U"pm
~ m
;s   
f