
    qi                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm c m!Z" d dl#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3 ddl4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z>m Z  ddl?m@Z@ ddlAmBZBmCZCmDZDmEZE ddlFmGZG ddlHmIZI ddlJmKZK ddlLmMZM ddlNmOZOmPZPmQZQmRZR ddlSmTZTmUZU ddlVmWZWmXZXmYZYmZZZ ddl[m\Z\ ddl m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZjmkZk ddllmmZnmoZompZpmqZq ddlrmsZs d d!ltmuZu d d"lvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZmZ d d#lmZmZmZmZmZmZmZ d d$lmZmZmZmZmZ d d%lmZ er,d d&lmZ d d'lmZ d d(lmZ dd)l=mZ d d*lvmZ d d+lmZ  ed,      Z ejD                  e      ZejJ                  jM                  ed-      ZejJ                  jM                  ed.      ZejJ                  jM                  ed/      Z e@       Z?d0 Zddd1Z G d2 d3      Z ed      ded4       Z ed      ded5       Z G d6 d7      Zej`                   G d8 d9             Zej`                   G d: d;             Zej`                   G d< d=e             Zej`                   G d> d?e             Z	 	 	 	 	 	 	 	 dfd@ZdA Z G dB dCe      Z e       jp                  ZdgdDZdgdEZdhdFZdgdGZdidHZdjdIZ G dJ dKe{      ZdkdLZdldmdMZ G dN dOe      ZeÐj                  dP        G dQ dReë      Z G dS dT      Zej`                   G dU dV             Z G dW dX      Zej`                   G dY dZ             Z G d[ d\ezeeeeeef   f   f         Zej`                   G d] d^             Z G d_ d`ee         Z G da dbe      ZdndcZy)o    )annotationsN)abstractmethod)CallableIterableSequence)	lru_cache)AnycastOptionalTYPE_CHECKINGTypeVarUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)get_triton_versionhas_triton_packagehas_triton_stable_tma_api   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetricsutils)AsyncCompile)	code_hashget_pathPyCodeCachewrite_atomic)'set_kernel_post_grad_provenance_tracing)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)get_broadcasted_shape)cache_on_selfDelayMaybeLineDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferis_buffer_removedOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRootPartialAccumulate
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleTypeDtypePropagationOpsHandler)ShapeEnv)IRNode)BlockShapeType)SIMDKernelFeatures_T
perf_hintsschedulefusionc                <    | dv }|rdnd}| dv r| d|  dS | d|  S )N)anymaxminprodtriton_helperstl)rz   r{   .2 )reduction_type
use_helpermodules      d/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/triton.pyget_triton_reduction_functionr      sE    #@@J!+F'>*!,,>*++    c                    t        | t        j                        syt        | t        j                        xs& | j                  xr t        | j                        dk(  S )z "
    Is this expression a Sympy Integer or is it an integer sympy Expr
    containing no free symbols. The latter case can happen with Identity expr.
    Fr   )
isinstancesympyExprInteger
is_integerlenfree_symbolsexprs    r   is_sympy_integer_liker      sI    
 dEJJ'dEMM* 7C 1 12a7r   c                  @    e Zd ZU dZi Zded<   i Zded<   edd       Zy)	OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsc                    |j                   }t        t        j                  t        j                  g      | j
                  |<   || j                  |<   y N)__name__r   torchfloat32float64r   r   )clsfuncconvert_outputop_names       r   register_upcastzOpDtypeSupport.register_upcast   s=    --(2EMM5==3Q(RW%'5G$r   N)r   zCallable[..., str]r   boolreturnNone)	r   
__module____qualname____doc__r   __annotations__r   classmethodr   r   r   r   r   r      s1    
 <>8=')O_)6 6r   r   c                 d    t               syddl} t        | j                  j                  d      ryy)zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   triton.compiler.compilerhasattrcompiler)tritons    r   gen_attr_descriptor_importr      s-     # v''):;Er   c                     t               } | j                  d       t               x}r| j                  |       | j                  d       | j	                         S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rV   splicer   	writelinegetvalue)imports	attr_descs     r   gen_common_triton_importsr      s[    GNN	 /00y0)$NN	 r   c                     e Zd ZdZ eej                  ej                  g      Z eej                  ej                  ej                  ge      ZeD  ci c]%  }|t        j                  t        |    ddd      ' c}}}} ZeD  ci c]3  }|t        j                  t        |   j#                          ddd      5 c}}}} Zedd       Zedd       Zedd	       Zy
c c}}}} w c c}}}} w )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    offsetTintegernonnegativeBLOCKr   positivec           
        d}|j                   }|D ]  }t        |t        j                        r?t        j
                  j                  j                  |j                     }|j                  }nht        |t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  f      rd}n| j                   D cg c]  }t        ||      s| }}t#        |      dk(  sJ d|j                          |d   }	t        j
                  j%                         }
dg|
z  }t        j
                  j'                         D cg c]  }t(        |	   |j*                  k(  r| }}t#        |      dk(  sJ d       t-        | j/                  |d               ||d   j0                  <   t3        |      }t5        ||      } |J |S c c}w c c}w )Nr   rN   Ambiguous type: r   1z# of Match expected to 1)r   r   r   TMPrL   kernelcsevarname_mapnameshapeUNBACKED_INTSIZEPRECOMPUTED_SIZEINDEXFLOATUNBACKED_FLOATblock_typesr   triton_tensor_ndimactive_range_treesr   prefixstrget_block_size
tensor_dimtupler9   )r   r   
expr_shape	expr_varsvarcse_var	var_shapesymtsymbol_matchessymndimr   tree
tree_matchs                 r   get_block_shapezTritonSymbols.get_block_shape   s    &(
%%	 %	FCc488,((,,22388<#MM	%%II))JJJJ''
 	 &)__"!sD8QD" " >*a/N3CCHH:1NN/$Q'xx224 !" ; ; =!#$++5 
 
 :!+G-GG+25c6H6HTU6W2Xjm../!%L	 /z9EJK%	FN %%%1"s   %G+7G+(G0c                4    | j                   |j                     S r   )block_sizesr   r   r   s     r   r   zTritonSymbols.get_block_size  s    tyy))r   c                4    | j                   |j                     S r   )block_offsetsr   r   s     r   get_block_offsetzTritonSymbols.get_block_offset  s      ++r   N)r   
sympy.Exprr   rr   )r   ra   r   zsympy.Symbol)r   r   r   r   r   r   R0_INDEXR1_INDEXreduction_typesXBLOCKYBLOCKZBLOCKr   r   Symbolr   r   upperr   r   r   r   r   ).0r   r   r   s   0000r   r   r      s    !$--!?@Odkk4;;VoVWK    	ellj./v6RVWWM  	   	ell$%%'(.t
 	
K 4 4l * * , ,Q
s   *C(
8C0
r   c                      e Zd ZU ded<   ded<   ded<   ded<   d	ed
<   ded<   ddZddZddZddZddZe	dd       Z
y)IndexingOptionsr   	index_strOrderedSet[str]	mask_varszOptional[str]
expand_strr   _has_rindexr   indexz#Optional[Sequence[Union[int, str]]]expand_shapec                ,    t        | j                        S r   )r   r   selfs    r   has_maskzIndexingOptions.has_mask-  s    DNN##r   c                J    t        | j                  t        j                        S r   )r   r   r   r   r   s    r   has_indirectzIndexingOptions.has_indirect0  s    "4::txx88r   c                    | j                   S r   )r   r   s    r   
has_rindexzIndexingOptions.has_rindex3  s    r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)tmpNr   
startswithr   masks     r   	<genexpr>z.IndexingOptions.has_tmpmask.<locals>.<genexpr>7  s     J43t9''.J   $&ry   r   r   s    r   has_tmpmaskzIndexingOptions.has_tmpmask6  s    J4>>JJJr   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |      j                  d          yw)rNr	  r  s     r   r  z,IndexingOptions.has_rmask.<locals>.<genexpr>:  s     H3t9'',Hr  r  r   s    r   	has_rmaskzIndexingOptions.has_rmask9  s    HHHHr   c                    | j                   r2dj                  t        t        t        | j                                     S dS )N & r   )r   joinsortedmapr   r   s    r   mask_strzIndexingOptions.mask_str<  s4     =ANNEJJvc#t~~678	
PV	
r   Nr   r   r   r   )r   r   r   r   r  r  r  r  r  propertyr  r   r   r   r   r   $  sT    N55$9 KI 
 
r   r   c                     e Zd ZU dZded<   ded<   ded<   ded	<   d
ed<   ded<   d
ed<   ded<   dZded<   dZded<   ed'd       Zed'd       Z	ed'd       Z
ed'd       Zedd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d(d       Z	 	 	 	 	 	 	 	 d)dZd*dZ	 	 	 	 	 	 d+dZd,d Zd-d!Zd-d"Zd-d#Zd-d$Zd-d%Z	 	 	 	 	 	 	 	 	 	 	 	 d.d&Zy)/BlockDescriptorOptionsz
    This is a base class that describes a block descriptor used in Triton kernels.
    It can be used to create either a tensor descriptor (with TensorDescriptorOptions)
    or a block pointer (with BlockPtrOptions).
    BlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shape
list[bool]broadcasting_dimsfinal_shapeBlockParameters.StrideSorterstride_sorterNzOptional[list[int]]_boundary_checkFr   can_liftc                .    | j                   j                  S r   )r!  r   r   s    r   r   zBlockDescriptorOptions.shape^  s    {{   r   c                .    | j                   j                  S r   )r!  block_shaper   s    r   r0  z"BlockDescriptorOptions.block_shapeb  s    {{&&&r   c                .    | j                   j                  S r   )r!  stridesr   s    r   r2  zBlockDescriptorOptions.stridesf      {{"""r   c                .    | j                   j                  S r   )r!  offsetsr   s    r   r5  zBlockDescriptorOptions.offsetsj  r3  r   )r-  c                  t         j                  j                  d	fd} ||j                        |_         ||j                        |_        |j
                  D 	cg c]  }	j                  |	d       }
}	t        |
      rd|
d<   |j                  |
      }|j                  |t         j                  j                        \  }}|j                  D cg c]  }j                  |d       }}|j
                  }|j                  |      }|D cg c]  }t        j                  |       }}t         j                  j                  r%|d   j                  dk(  sJ |j!                  d       t         j                  j"                  }t         j                  j$                  st'        |j                        t'        t         j                  j(                        |z
  k(  rIt         j                  j*                  j-                         r!|t.        j0                  j2                  g|z  z  }	 t5        j6                  t         j                  j                  |j                        } | |t         j                  j                  jA                  |      |||||||	      }|jC                  ||       |S c c}	w c c}w c c}w # t8        $ r3 t;        t=        t?        t'        |j                                          }Y w xY w)
z2Helper to create a BlockDescriptorOptions instancec                L    | D cg c]  }j                  |       c}S c c}w r   )lookup_precomputed_size)exprsr   sizevarss     r   lookup_sizez2BlockDescriptorOptions.create.<locals>.lookup_size~  s"    GLMtH44T:MMMs   !rN   F)stride_sorter_cls	shape_envr   x)	r!  r"  r$  r   r)  r&  r(  r+  r-  )r9  zIterable[sympy.Expr]r   list[sympy.Expr])"rL   graphr:  r   r2  r0  statically_known_equalsallremove_dimsmaybe_sort_with_stride_order
_shape_envr   r   r   no_x_dimr   popnum_reduction_dimsinside_reductionr   numelsfeaturesis_reductionr   SOner%   argsort_symAssertionErrorlistreversedranger8  compute_boundary_check)r   r!  r"  range_treesr   get_max_blockr=  r-  r;  dimsingleton_dimsr+  strider(  r&  r   r)  reduction_ndimr$  resultr:  s                       @r   createzBlockDescriptorOptions.createn  s    77##	N #6<<0$V^^4 AG@R@R
9<H,,S!4
 
 ~!&N2 ##N3 !' C C/177;M;M !D !
 GMnn
<BH,,VQ7
 
 !,, ##$56 GRRd}33D9RR88q>((C///OOA44))FNN#s188??';n'LL!!..0 EGGKK=>99K	? %%agg&8&8&..IE
 GG,,DD_U#+/'

 	%%m[AC
$
 S$  	?%FNN(;"<=>E	?s$   J-J2J7)8J< <9K87K8c                D    t         j                  |   }t        |||i      S zN
        Replaces instances of {symt}_offset with the new expression.
        r   r   rE   r   r   replacementr   roffsets        r   replace_offsetz%BlockDescriptorOptions.replace_offset  &      --d3$+ 677r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S Nr   r   r   rd  r   r   r   r   r   s      r   remove_roffsetsz&BlockDescriptorOptions.remove_roffsets  ;    !11 	ED&&tU]]1-=tDD	Er   c           
        t         j                  j                  }|D ci c]7  }t        j                  |j
                      |t        |j
                           9 }}t        t        t         j                  j                  |            }t        t        | j                              D cg c]%  }|j                  | j                  |   t         j"                  j$                        s|r:t        j                  t&        j(                     | j*                  |   j,                  v sb|j/                  | j                  |   | j*                  |         s|j/                  | j                  |   t1        | j*                  |   |            sMt         j                  j2                  r1| j*                  |   t        j                  t&        j4                     k(  s|( c}| _        yc c}w c c}w )z6List of indices to pass to tl.load(boundary_check=...)N)rL   rA  r:  r   r   r   r   ry   r  r   needs_yz_grid_overflowrT  r   r   rB  r2  r   rN  Zeror   r   r0  r   statically_known_multiple_ofrE   rG  r   r,  )r   rW  rV  r:  tblock_to_maxneeds_overflow_grididxs           r   rU  z-BlockDescriptorOptions.compute_boundary_check  s    77## !/
 %%aff-}Z=O/PP/
 /
 "#ahh&E&E{"ST S_- 
44T\\#5FU ,)55dkkB++C0==> %AA JJsOT-=-=c-B !) E E JJsO&t'7'7'<lK! HH%%((-1J1J4;;1WW-  
/
 
s   <G*D+G#c                6    | j                   J | j                   S r   )r,  r   s    r   boundary_checkz%BlockDescriptorOptions.boundary_check  s     ##///###r   c                     yNFr   r   s    r   r  z#BlockDescriptorOptions.has_indirect      r   c                :    t        d | j                  D              S )Nc              3  P   K   | ]  }t        |t        j                           y wr   )r   r   r   )r   r   s     r   r  z4BlockDescriptorOptions.has_rindex.<locals>.<genexpr>  s%      
  m&C&CD
r  )ry   r0  r   s    r   r  z!BlockDescriptorOptions.has_rindex  s"     
((
 
 	
r   c                "    | j                         S r   )r  r   s    r   r  z BlockDescriptorOptions.has_rmask  s      r   c                     yrw  r   r   s    r   r  z"BlockDescriptorOptions.has_tmpmask  rx  r   c                4    t        | j                               S r   )r   ru  r   s    r   r  zBlockDescriptorOptions.has_mask!  s    D'')**r   c                T   | j                   }| j                  }|rJ| j                  j                  | j                         }| j                  j                  | j                        }t	        ||      D 	cg c]#  \  }}	|	rt
        j                  j                  n|% }
}}	t        |||
      }| j                  j                  s4|s2t        |
      t        |      k(  r| j                  j                  |
      }
t        j                  j                  |xr7 t        |
      t        |      k(  xr t        fdt	        |
|      D              }t        | j                        r(|s&d| dt        j                   j#                  |       d}| j                   }| j                  j                  sj|r| j                  j$                  n| j                  j&                  }d| d| d}|r| j                   n$| j                  j                  | j                         }t        |||      }|S c c}	}w )a  
        Generate a broadcast and a reshape for the block descriptor.
        This restores stride-0 dimensions which were removed from the block descriptor.

        Transposes are also applied to the input using self.stride_sorter:
        if for_store is True:
            - First Broadcast the value. Since self.broadcast_shape is stored in
            descending stride order, it must be reverted to the original order
            since the input value does not have dims with descending strides
            - After, transpose the broadcasted value so that dimensions are in
            descending stride order
            - Finally reshape to the block shape
        else (for load):
            - First broadcast the value to self.broadcast_shape (strides are descending)
            - Then transpose the value so that dimensions no longer have descending strides
            - Finally reshape the block to the final kernel tile shape
        c              3  p   K   | ]-  \  }}j                  |d       xs j                  ||       / ywrN   N)rB  )r   pre_dimpost_dimr:  s      r   r  zGBlockDescriptorOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>b  sH       &GX 00!< G33GXFGs   36tl.broadcast_to(, )	tl.trans()r&  r(  r+  revertzipr   rN  rO  triton_reshapeis_identityr   rL   rA  r:  rC  ry   r   index_to_strsort_idxrevert_sort_idx)r   valueinitial_shaper)  allow_implicit	for_storer&  r(  rX  is_broadcastingpre_broadcast_shapesupports_implicit_broadcast	old_shapepermute_dimsr:  s                 @r   codegen_broadcast_and_reshapez4BlockDescriptorOptions.codegen_broadcast_and_reshape$  s   2 .. 22 "00778L8LMO $ 2 2 9 9$:P:P Q
 ),O=N(O
$_ +EGGKK3
 
 um5HI ""..'(C,<< #'"4"4";";<O"P 77##&4 '
#$K(88   *--@+)N  	$ t%%&/J"5'AHH,A,A/,R+SSTU  ((	!!--  ""++''77 
  wba8E  $$''..t/C/CD  ui=w
s   5(H$r   r@  )r!  r   r"  r   rV  list[IterationRangesRoot]r   r   rW  Callable[[str], int]r=  z"type[BlockParameters.StrideSorter]r-  r   r   r  r   r   rb  r   r   r   r   r   r   r   r   r   )rW  r  rV  r  r   r   )r   r#  r  )r  r   r  r%  r)  r%  r  r   r  r   r   r   )r   r   r   r   r   r,  r-  r  r   r0  r2  r5  r   r]  rd  rj  rU  ru  r  r  r  r  r  r  r   r   r   r  r  F  s    ))!!%% 0/+/O(/ Hd! ! ' ' # # # #  [  [ $	[
 /[ #[ ,[ >[ [ 
 [ [z88-78?C8	8
2
+2
 /2
 
	2
h$
!+aa ,a *	a
 a a 
ar   r  c                      e Zd ZdddZy)TensorDescriptorOptionsc                ,   t         j                  j                  }| j                  dk7  r| d || j                         dn|d || j                         d || j
                         d || j                         g}ddj                  |       dS )	a  
        Codegen a call to tl.make_tensor_descriptor()

        Args:
            name: variable name for pointer
            roffset: unused, but kept for compatibility with BlockPtrOptions.format()

        Returns:
            "tl.make_tensor_descriptor(...)"
        r    + (r  shape=strides=block_shape=ztl.make_tensor_descriptor(r  )rL   r   r  r"  r   r2  r0  r  )r   r   rc  fargss        r   formatzTensorDescriptorOptions.format  s     HH!! ''1, &Qt3345Q7Qtzz]O$q'(1T--./0	
 ,DIIdO+<A>>r   NTr   r   r   r   )r   r   r   r  r   r   r   r  r    s    ?r   r  c                  >    e Zd Z	 	 	 	 	 	 	 	 ddZddZdd	dZd
dZy)BlockPtrOptionsc                D    t         j                  |   }t        |||i      S r_  r`  ra  s        r   rd  zBlockPtrOptions.replace_offset  re  r   c                |    t         j                  D ](  }| j                  |t        j                  d      |      }* |S rg  rh  ri  s      r   rj  zBlockPtrOptions.remove_roffsets  rk  r   c           	        t         j                  j                  }g | j                  }|s|D cg c]  }| j	                  |       }}| j
                  dk7  r| d || j
                         dn|d || j                         d || j                         d || j                         d || j                         d ||       g}d	d
j                  |       dS c c}w )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r   r  r  r  r  r  zorder=zoffsets=ztl.make_block_ptr(r  )rL   r   r  r5  rj  r"  r   r2  r0  r$  r  )r   r   rc  r  r5  r   r  s          r   r  zBlockPtrOptions.format  s     HH!!!DLL/BIJt++F3JGJ ''1, &Qt3345Q7Qtzz]O$q'(1T--./0Qtzz]O$qzl#
 $DIIdO#4A66 Ks   C"c           	         t         j                  |   }| j                  D cg c]A  }| j                  |||      | j                  |t        j
                  j                  |      z
  C }}|S c c}w )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        )r   r   r5  rd  r   rN  rn  )r   r   rblockr   advances        r   advance_roffsetzBlockPtrOptions.advance_roffset  st     **40 ,,

  ##FFD9%%feggllDAB
 
 
s   AA,Nr  r  r  r  )r   r   r   r   )r   r   r   rd  rj  r  r  r   r   r   r  r    s6    88-78?C8	8
7:r   r  c                r   t        |t              rt        |t              sJ |D cg c]!  }t        j                  j	                  |      # }}|D cg c]!  }t        j                  j	                  |      # }}||k(  r| S |D cg c]
  }|dk7  s	| c}|k7  rd|  ddj                  |       dS d}g }|D ]G  }	|t        |      k  r|	||   k(  r|j                  d       |dz  }0|	dk(  sJ |j                  d	       I |t        |      k(  sJ |  d
dj                  |       dS c c}w c c}w c c}w )z<Workaround https://github.com/triton-lang/triton/issues/2836r   ztl.reshape(z, [r  z])r   :rN   r   [])r   rR  rL   r   r  r  r   append)
r  r  	new_shaper   old_shape_strnew_shape_strsrs  expandsizes
             r   r  r    sE    i&:i+FFF?HIeQXX**51IMI?HIeQXX**51IMI% -aAH->UG3tyy'?&@CC
CF "]##c0B(BMM#1HC3;;MM&!" #m$$$$WAdii'(**% JI .s   &D*&D/
D4D4c                     t         j                  j                  j                  j                  syt         j
                  j                  t         j
                  j                               \  } }| dk\  S )NF	   )r   	_inductorr"   r   
enable_pdlcudaget_device_capabilitycurrent_device)major_s     r   enable_pdl_codegenr    sM    ??!!((33zz//

0I0I0KLHE1A:r   c                      e Zd Zd dZd dZd dZd dZd dZd dZd dZ	d dZ
d d	Zd d
Zd dZd dZd dZd dZd!dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZd dZ y)"TritonPrinterc                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrN   libdevice.trunc(r   ).to(r  r   r  _printrL   r   index_dtyper   r   s     r   _print_TruncToIntzTritonPrinter._print_TruncToInt  M    499~"""t{{499Q<89qxx?S?S>TTUV	
r   c                    |j                   rt        t        |            }|S t        j                         rt
        j                  j                  r| }|S d| d}|S )Nztl.full([], z, tl.float64))r   r   intr"   	is_fbcoder   versionhip)r   r   rets      r   _print_FloatzTritonPrinter._print_Float  s\    ?? c$i.C
 
	 EMM$5$5FC 
 !m4C
r   c                    t        |j                        dk(  sJ | j                  |j                  d   t        d   dz
        }| dS )NrN   r   Atom      ?z.to(tl.float64))r   r  parenthesizer   )r   r   r  s      r   _print_ToFloatzTritonPrinter._print_ToFloat  sI    499~"""diilJv,>,DEO$$r   c                    |j                   \  }}|j                  r3|j                  r'| j                  |j                   dt        d   dz
        S | j	                  |      }| j	                  |      }d| d| dS )N % r  r  z!triton_helpers.remainder_integer(r  r  )r  is_nonnegative	stringifyr   r  r   r   quotdivquot_sdiv_ss         r   _print_PythonModzTritonPrinter._print_PythonMod$  sr    II	c3#5#5>>$))UJv4F4LMMT"C 26("UG1EEr   c                   |j                   sJ |j                  \  }}|j                  r3|j                  r'| j                  |j                  dt        d   dz
        S | j                  |      }| j                  |      }d| d| dS )N // r  r  z!triton_helpers.div_floor_integer(z,  r  )r   r  r  r  r   r  r  s         r   _print_FloorDivzTritonPrinter._print_FloorDiv,  s~    II	c3#5#5>>$))VZ5G#5MNNT"C 26(#eWAFFr   c                P    | j                  |j                  dt        d   dz
        S )N / r  r  )r  r  r   r  s     r   _print_IntTrueDivzTritonPrinter._print_IntTrueDiv7  s#    ~~dii
60BS0HIIr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrN   libdevice.floor(r   r  r  r  r  s     r   _print_floorzTritonPrinter._print_floor<  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  r  r  s     r   _print_FloorToIntzTritonPrinter._print_FloorToIntB  r  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS NrN   libdevice.ceil(r   r  r  r  r  s     r   _print_ceilingzTritonPrinter._print_ceilingH  K    499~""" TYYq\!: ;5AUAU@VVWXXr   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS r  r  r  s     r   _print_CeilToIntzTritonPrinter._print_CeilToIntL  r  r   c                    t         j                  j                         rd| j                  |       dS d| j                  |       dS )Nzlibdevice.sqrt(().to(tl.float32))ztl.sqrt_rn(()r   xpuis_availabler  r  s     r   _helper_sqrtzTritonPrinter._helper_sqrtP  sF    99!!#%dkk$&7%88IJJdkk$/00ABBr   c                    d| j                  |j                  d          d| j                  |j                  d          dS )Nlibdevice.pow(r   r  rN   r  )r  r  r  s     r   _print_FloatPowzTritonPrinter._print_FloatPowV  s?    T[[167r$++diiPQl:S9TTUV	
r   c                ,   |j                   d   j                  r;dt        |j                   d          d| j                  |j                   d          dS d| j                  |j                   d          d| j                  |j                   d          dS )Nr   r  r  rN   r  )r  
is_Integerfloatr  r  s     r   _print_PowByNaturalz!TritonPrinter._print_PowByNatural[  s    99Q<""#E$))A,$7#84;;tyyQR|;T:UUVWWT[[167r$++diiPQl:S9TTUV	
r   c                    | j                  |j                  d         }| j                  |j                  d         }| j                  |j                  d         }d| d| d| dS )Nr   rN   r!   	tl.where(r  r  )doprintr  )r   r   cpqs        r   _print_WherezTritonPrinter._print_Whereb  s_    LL1&LL1&LL1&1#Rs"QCq))r   c                   t        |j                        dk(  r| j                  |j                  d         S t        |j                        dz  }t        |      }| j                   ||j                  d|        }| j                   ||j                  |d        }t	        d ||fD              \  }}|dv sJ d| d       d	| d
| d| d| d| d
| d| d| dS )zI
        Helper for max/min code generation.
        cmp: > or <
        rN   r   r!   Nc              3  (   K   | ]
  }d | d  yw)(r  Nr   r   r?  s     r   r  z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>w  s     .!q1X.s   )><zUnexpected comparator: ''r  z * ( z= z) + )))r   r  r  typer   )r   r   cmpmidr   abs          r   _print_min_max_helperz#TritonPrinter._print_min_max_helperh  s    
 tyy>Q;;tyy|,,$))n!4jKKTYYt_-.KKTYYst_-. .1v..1j C$<SE"CC 1#T!AcU"QCtA3d1#Qse1QCrBBr   c                &    | j                  |d      S )Nr  r  r  s     r   
_print_MinzTritonPrinter._print_Min{      ))$44r   c                &    | j                  |d      S )Nr  r  r  s     r   
_print_MaxzTritonPrinter._print_Max~  r   r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   tl_math.abs(r   r  r   r  r  r  s     r   
_print_AbszTritonPrinter._print_Abs  s9    499~"""dkk$))A,78::r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.cos((r   r  r%  r  s     r   _print_OpaqueUnaryFn_cosz&TritonPrinter._print_OpaqueUnaryFn_cos  :    499~""" TYYq\!: ;;LMMr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.cosh((r   r  r%  r  s     r   _print_OpaqueUnaryFn_coshz'TritonPrinter._print_OpaqueUnaryFn_cosh  :    499~"""!$++diil";!<<MNNr   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.acos((r   r  r%  r  s     r   _print_OpaqueUnaryFn_acosz'TritonPrinter._print_OpaqueUnaryFn_acos  r,  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.sin((r   r  r%  r  s     r   _print_OpaqueUnaryFn_sinz&TritonPrinter._print_OpaqueUnaryFn_sin  r)  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.sinh((r   r  r%  r  s     r   _print_OpaqueUnaryFn_sinhz'TritonPrinter._print_OpaqueUnaryFn_sinh  r,  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.asin((r   r  r%  r  s     r   _print_OpaqueUnaryFn_asinz'TritonPrinter._print_OpaqueUnaryFn_asin  r,  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.tan((r   r  r%  r  s     r   _print_OpaqueUnaryFn_tanz&TritonPrinter._print_OpaqueUnaryFn_tan  r)  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.tanh((r   r  r%  r  s     r   _print_OpaqueUnaryFn_tanhz'TritonPrinter._print_OpaqueUnaryFn_tanh  r,  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.atan((r   r  r%  r  s     r   _print_OpaqueUnaryFn_atanz'TritonPrinter._print_OpaqueUnaryFn_atan  r,  r   c                z    t        |j                        dk(  sJ d| j                  |j                  d          dS )NrN   zlibdevice.log2((r   r  r%  r  s     r   _print_OpaqueUnaryFn_log2z'TritonPrinter._print_OpaqueUnaryFn_log2  r,  r   c                    t        |j                        dk(  sJ d| j                  |j                  d          dt        j                  j
                   dS )NrN   zlibdevice.llrint(r   r  r  r  r  s     r   _print_RoundToIntzTritonPrinter._print_RoundToInt  sM    499~"""DIIaL 9:%@T@T?UUVW	
r   c                    t        |j                        dk(  sJ |j                  \  }}|j                  r|dk  sJ t        d| d      | j	                  |t
        d         }d| d| d|  S )	Nr!   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got r   Mulzlibdevice.nearbyint(1e * z) * 1e)r   r  r   
ValueErrorr  r   )r   r   numberndigits
number_strs        r   _print_RoundDecimalz!TritonPrinter._print_RoundDecimal  s    499~"""))Q;;abiajjkl  &&vz%/@A
'yJ<vwhZPPr   N)r   r   r   r   )r   r   r  r   r   r   )!r   r   r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r&  r(  r+  r.  r0  r2  r4  r6  r8  r:  r<  r>  rF  r   r   r   r  r    s    

%
FGJ


YYC


*C&55;NOONOONOOO
Qr   r  c                *    t        t        |             S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)rF   rH   dtypes    r   triton_compute_typerJ    s    *5122r   c                ^    | t         j                  k(  rt         j                  } t        |       S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)r   r   int8rF   rH  s    r   triton_store_typerM    s"    



ur   c                    t        |       r+| j                  r| j                  dk  rt        j                  S t        |       S )z0Implicit upcasts used for Triton reduction types   )r   	is_signeditemsizer   int32rH   rH  s    r   upcast_acc_dtyperS    s0    5??u~~7J{{u%%r   c                *    t        t        |             S )z:Convert torch.dtype to triton type, with reduction upcasts)rJ  rS  rH  s    r   triton_acc_typerU    s    /677r   c                <    | j                   dk  xr | j                  S )Nr!   )rQ  is_floating_pointrH  s    r   low_precision_fprX    s    >>Q:5#:#::r   c                    t        | t              sy| j                  }t        |t        j                        rt	        |      S dS rw  )r   rT   rI  r   rX  )r   rI  s     r   low_precision_fp_varrZ    s6    c;'IIE&0&DE"O%Or   c                  <     e Zd Z	 d	 	 	 	 	 	 	 	 	 d fdZd Z xZS )TritonCSEVariablec                n    t         |   ||||       t               | _        |J d       |J d       y )Nr   z!TritonCSEVariable must have dtypez!TritonCSEVariable must have shape)super__init__r   r   )r   r   boundsrI  r   	__class__s        r   r`  zTritonCSEVariable.__init__  sH     	vuE:*4, E"EE  E"EE r   c                F   |D ]  }t        |t              r&| j                  j                  |j                         9t        |t        j
                        sTt        j                  D ]6  }t        ||      s| j                  j                  t        |    dg          y )Nr  )
r   r\  r   updater   r   r   r   r   r   )r   r   r  kwargsargr   s         r   update_on_argsz TritonCSEVariable.update_on_args  s     
	C#01%%cmm4C. *55 D%c40--*T2B1C4/H.IJ
	r   r   )
r   r   ra  zValueRanges[Any]rI  torch.dtyper   rr   r   r   )r   r   r   r`  rg  __classcell__rb  s   @r   r\  r\    sH     !%FF !F 	F
 F 
Fr   r\  c                     ddl m}   |        S )Nr   rn   )!torch._inductor.dtype_propagationro   rn   s    r   get_dtype_handlerrm    s    L%''r   c                0     dddfdd fd}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    c                    t         j                  j                   xr> t        | t              xr, | j
                  t        j                  t        j                  fv S r   )	r"   r   codegen_upcast_to_fp32r   rT   rI  r   float16bfloat16)r   s    r   needs_upcastz*maybe_upcast_float32.<locals>.needs_upcast  sD    444 =3,=		emmU^^<<	
r   c                (     |       rdnd}|  | S )N.to(tl.float32)r   r   )r   upcast_stringrs  s     r   maybe_upcast_argz.maybe_upcast_float32.<locals>.maybe_upcast_arg  s!    -9#->)B}o&&r   c                H     t         j                          d fd}|S )Nc                    | D cg c]
  } |       }}|j                         D ci c]  \  }}| |       }}} |i |}xr6 t        fdt        j                  | |j	                               D              }|sd n# t        t               j                        | i |}	|	t        j                  d fv}
|
r|	dt        |	       dnd}| | S c c}w c c}}w )Nc              3  .   K   | ]  } |        y wr   r   )r   r   rs  s     r   r  zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>&  s      6&)S!6s   .to(r  r   )itemsry   	itertoolschainvaluesgetattrrm  r   r   r   rF   )r  re  rf  upcast_argskeyvalupcast_kwargsr\  any_needs_upcastresult_dtypeneeds_downcastdowncast_stringr   r   rw  rs  s               r   wrappedz8maybe_upcast_float32.<locals>.decorator.<locals>.wrapped  s   <@AS+C0AKAHNWHCS"23"77WMW ;8-8F-  # 6-6__T6==?-S6 3
 ( @W.0$--@$Q&Q 
 *%--1FFN "l&> {<013 
 Xo.//' BWs
   CCr  )r   r   )r   r  r   rw  rs  s   ` r   	decoratorz'maybe_upcast_float32.<locals>.decorator  s$    &&t^<	0 	0. r   r  r  )r   Callable[..., Any]r   r  r   )r   r  rw  rs  s   ` @@r   maybe_upcast_float32r  
  s    
': r   c                     e Zd ZdZ ej
                  ej                        Ze	 	 dO	 	 	 dPd       Z	edQd       Z
ed        Zed        Ze e       d               Zed	        Zed
        Ze e       d               Ze e       d               Ze e       d               Ze e       d               Zed        Zed        Zed        Zed        Zed        Zedej8                  dddd       Ze e       d               Ze e       d               Zed        Z ed        Z!e e       d               Z"e e       d               Z#e e       d               Z$e e       d               Z%e e       d               Z&e e       d                Z'e e       d!               Z(e e       d"               Z)e e       d#               Z*e e       d$               Z+e e       d%               Z,e e       d&               Z-e e       d'               Z.e e       d(               Z/e e       d)               Z0e e       d*               Z1e e       d+               Ze e       d,               Z2ed-        Z3ed.        Z4ed/        Z5ed0        Z6ed1        Z7ed2        Z8ed3        Z9ed4        Z:ed5        Z;ed6        Z<ed7        Z=ed8        Z>ed9        Z?ed:        Z@e e       d;               ZAe e       d<               ZBe e       d=               ZCe e       d>               ZDe e       d?               ZEed@        ZFe e       dA               ZGe e       dB               ZHe e       dC               ZIe edDE      dF               ZJe edDE      dG               ZKe e       dH               ZLe e       dI               ZMedJ        ZNedK        ZOe e       dL               ZPedM        ZQe e       dN               ZRy)RTritonOverrideszEMap element-wise ops to Triton e.g., ops.to_dtype(x,...) -> x.to(...)NTc                Z   	 	 	 	 	 	 dd}|>t         |||      t        j                  j                        t        j                  _        |t        j
                  k(  rd|  dS |t        j                  k(  r||j                  s||  dS |rt        |      }nt        |      }|  d| dS )Nc                   | |k(  ryt         j                  t         j                  f}| |v r||v r| |k7  rJ d       | t         j                  k(  s|t         j                  k(  ry| t         j                  k(  s|t         j                  k(  ryy)Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!rO  r!   )r   float8_e4m3fnfloat8_e5m2)	src_dtype	dst_dtype
fp8_dtypess      r   _get_min_elements_per_threadz>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadG  s     I% ##!!J Z'+*U U	U 
 E---e>O>O1OE///9@S@S3Sr   r  z != 0)z.to(tl.int16).to(tl.uint8)r{  r  )r  rh  r  rh  r   r  )
rz   rL   r   min_elem_per_threadr   r   uint8rW  rJ  rM  )r?  rI  r  use_compute_typesr  	out_dtypes         r   to_dtypezTritonOverrides.to_dtype@  s    	"	/:		6   ,/,Y>,,,AHH(
 EJJqc= ekk!!i&A&AYEV S233+E2I)%0ID1%%r   c                    |j                   |j                   k(  sJ | j                  |k7  r|  dt        |       d} |  dt        |       d}t        |      |k7  r| dt        t        |             d}|S )Nr{  r  z, bitcast=True))rQ  rI  rF   rH   )r?  rI  r  outs       r   to_dtype_bitcastz TritonOverrides.to_dtype_bitcast}  s    !!U^^333 77i#T+i013A4E*+?;u%.Ek*=e*DEFaHC
r   c           	         t         j                  j                  |      }t         ||             }t	        |      }|dk(  r|S | dk  r#|j
                  sd|dd   }d| d| d| d| d	S d| d| d| dS )	Nz
tl.float32r   ztl.rO  tl.full(r  r  r  )r   _prims_commondtype_to_typer`   rJ  rP  )r  rI  r   type_
triton_valrF   triton_signed_types          r   _shaped_constantz TritonOverrides._shaped_constant  s    ##11%8"5<0
)%0,& 19U__#&{12&7!8eWBzl"5G4Hk]Z[\\eWBzl"[MCCr   c                *    | j                  ||g       S )Nr^  )r  )r   r  rI  s      r   constantzTritonOverrides.constant  s    ##E5#;;r   c                    d|  dS )Nr$  r  r   r?  s    r   abszTritonOverrides.abs       aS""r   c                   t        | dd       }t        |dd       }|t        j                  k(  r-|t        j                  k(  rt        j                  r
d|  d| d}n	d|  d| d}t        j
                  j                         r	d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j                  fv r| dt        |       d}|S )NrI  ztriton.language.div_rn(r  r  r  r  r{  )r  r   r   r"   emulate_divison_roundingr  r  rZ  rm  truedivrq  rF   )r?  yx_dtypey_dtyper  r  s         r   r  zTritonOverrides.truediv  s    !Wd+!Wd+ u}}$5==(// ,A3b15CaSA3a.C
 99!!#aSA3a.C"&:1&=)+33Aq9IU]]EMM::T+i"8!9;
r   c                    d|  d| d}t        |       st        |      rMt               j                  | |      }|t        j                  t        j
                  fv r| dt        |       d}|S )Nr  r  r  r{  )rZ  rm  modr   rq  r   rF   )r?  r  r  r  s       r   r  zTritonOverrides.mod  sl    !Cs!n"&:1&=)+//15IU]]EMM::T+i"8!9;
r   c                :    t         j                  rd|  dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        ztl_math.exp(r  zlibdevice.exp()r"   use_fast_mathr  s    r   expzTritonOverrides.exp  s+     !!A&&#A3a((r   c                    d|  dS )Nzlibdevice.exp2(r  r   r  s    r   exp2zTritonOverrides.exp2       !1%%r   c                    d|  dS )Nzlibdevice.expm1(r  r   r  s    r   expm1zTritonOverrides.expm1       "!A&&r   c                V    t         j                  j                         rd|  dS d|  dS )Nzlibdevice.sqrt(r  ztl.sqrt_rn()r   r  r  r  s    r   sqrtzTritonOverrides.sqrt  s2     99!!#$QCq))QCq!!r   c                   t         j                  j                  }|dk(  ry|dk(  r	d|  d|  dS |dk(  r|  dS |8t        j                  t        j
                  d	t        j                        |       S t        d
|      )Ncompile_errorzcompile error!runtime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r  accuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r"   r   inject_relu_bug_TESTING_ONLYopsmaximumr  r   rR  rQ  )r?  bugs     r   reluzTritonOverrides.relu  s    mm88/!#O# 8s:YZ[Y\\]^^JS:[;;s||Au{{;Q?? LSGT r   c                    d|  d| dS )Nztriton_helpers.minimum(r  r  r   r  r  s     r   minimumzTritonOverrides.minimum      (2aS22r   c                    d|  d| dS )Nztriton_helpers.maximum(r  r  r   r  s     r   r  zTritonOverrides.maximum	  r  r   c                    d|  d| d| dS )Nr  r  r  r   )r  r  r
  s      r   wherezTritonOverrides.where  s    1#Rs"QCq))r   c           	       	
 t         j                  j                  sJ | |}}d }d } ||      r ||       }  ||      r ||      }	 	 	 	 	 	 d	
fd}t        t         j                  j	                               dk\  sJ d       t        t        j                  t        j                           	t        t        j                  t        j                           
t        t        j                  t        j                           t        t        j                  t        j                           t         j                  j                  j                  t         j                  j                   || t!        | j"                        
g      | j$                  
f      } t         j                  j                  j                  t         j                  j                   ||t!        |j"                        	g      |j$                  	f      }t&        j(                  j*                  j,                  j.                  dk(  rd}nd}d	|  d
| d| dS )a  
        Triton code generation for lowering ops.dot to tl.dot.

        The logic is as follows:

        1. Downcasting for performance
           If the data was previously upcasted to fp32, we downcast back to the
           original dtype (e.g., fp16 or bf16) for better performance. While
           surrounding operations may run in fp32, matmul itself is executed at the
           original precision to optimize throughput.

        2. Handling non-constant reduction masks
           If the reduction mask is not constant and there was any operation between
           tl.load and tl.dot, we zero out regions outside the mask using
           tl.where(r0_mask, val, 0).
           This ensures that values outside the mask do not contribute to the dot
           product, preventing incorrect results.

        3. Shape alignment for tl.dot
           We massage shapes to match the tl.dot requirement of (Y, R) x (R, X).
           Current codegen eagerly broadcasts tl.arange to create unique axes. We
           reshape, transpose, or broadcast to align with the (Y, R) x (R, X) shape.
           We avoid using 3D dot ((Z, Y, R) x (Z, R, X)) because 3D tl.dot has
           poor performance. During batched matmul (bmm), we keep ZBLOCK=1 and call
           the 2D dot kernel instead.
        c                x   t        t        t        | j                              syt        j
                  j                  d   }|j                  sJ t        j
                  j                  |      ryt        j
                  j                  j                  j                         D ]  \  }}|| k(  sd|v sd|v s y y)NFr<  ztl.loadz	other=0.0T)ry   r  rB   r   rL   r   rV  rM  _has_constant_maskr   _cacher|  )r   reduction_rangekvs       r   is_where_neededz,TritonOverrides.dot.<locals>.is_where_needed0  s    s.>?hh2226O"//// xx**?;
 ++113 !18	Q;!3C ! r   c                   t         j                  j                  d| j                        }t        j
                  j                  D cg c]  }|j                  r|j                   d }}t        |      dk(  sJ d       t        j                  |d   | |      }t        j
                  j                  j                  t        j
                  j                  || j                  | j                        S c c}w )Ndotr  rN   z'don't tile reduction when native matmulr   rI  r   )r#   	Reductiondefault_valuerI  rL   r   rV  rM  r   r   TritonKernelOverridesr  r   generatecomputer   )r   defaultr   reduction_mask	where_vars        r   
where_condz'TritonOverrides.dot.<locals>.where_condE  s    ll00		BG HH00$$ ;;-t$N  ~&!+V-VV+-33N14EsGTI88<<((  )399CII )  s   "C3c                   	|v r|D cg c]  }|	k(  rdn| }}|gk(  rE|vsJ d       ddg}|v r|d<   |v r|d<   t        | ||      } |gk7  }|rd|  d d d} | S |gk(  rK|vsJ d	       ddg}|v r|d<   |v r|d<   t        | ||      } d
|  d} |gk7  }|rd|  d d d} | S t        c c}w )a  
            Generate a reshape, transpose, and broadcast for the tl.dot.
            tl.dot requires specific shape requirement : (Y,R) x (R,X)
            but the current triton codegen eagerly broadcast the tl.arange so
            it needs to be reshaped to meet the requirement.

            This is done by three steps.
            1. remove the empty dimension (dim with size 1) and make it 2d with tl.reshape
            2. permute the dimension if needed (e.g., (X,R) -> (R,X)) with tl.trans
            3. broadcast if needed with broadcast_to.
                - This shows up when matmul operand is broadcasted with torch.expand/repeat.
                - e.g., torch.rand((16,)).expand(16,16) @ B

            e.g., (Y,1,R), (Y,R) -> tl.reshape(var, (Y,R))
            e.g., (1,X,R), (R,X) -> tl.trans(tl.reshape(var, (X,R)))
            e.g., (1,X,1), (R,X) -> tl.broadcast_to(tl.trans(tl.reshape(var, (X,1))), (R,X))

            TODO : eventually we want to remove this function when lazy broadcasting arrives
            r   z&left tl.dot operand cannot depend on xr   rN   r  z, (r  r  z'right tl.dot operand cannot depend on yr  r  )r  NotImplementedError)
r  r  r)  rX  shape_2dbroadcast_neededRBLOCKr   r   r   s
         r   #reshape_transpose_broadcast_for_dotz@TritonOverrides.dot.<locals>.reshape_transpose_broadcast_for_dote  s|   6 &JW X3v3!> X Xvv..]2 <2  :]*"(HQK]*"(HQK 'umXF $,/?#? #.ugS6("ME4 L1  00]2 =2  :]*"(HQK]*"(HQK 'umXF $E7!, $,/?#? #.ugS6("ME L *)W !Ys   C r   ztl.dot can only do mm and bmmr  tf32ieeeztl.dot(r  z, input_precision="z"))r  r%  r)  r%  r   r   )rL   r   is_native_matmulr   dense_size_listr   r   r   r   r   r   r   r   r   r  r  rR  r   rI  r   backendsr  matmulfp32_precision)r  r  orig_aorig_br  r  r  input_precisionr  r   r   r   s           @@@@r   r  zTritonOverrides.dot  s   8 xx((((A	*	6 6"1A6"1AI	/I	 .I	 	I	 I	V 188++-.!3T5TT3]..t{{;<]..t{{;<]..t{{;<]..t}}=>HHLL!!HH/4=66BRS''6"	 " 
 HHLL!!HH/4=66BRS''6"	 " 
 >>%%44>$O$O2aS 3O3DBGGr   rN   )constraintsrI  is_purepackc                    t        |      }dj                  |D cg c]  }t        |       c}      }|#dj                  dg|D 	cg c]  }	d c}	z         }d|  d| d| d| d| d	| d
S c c}w c c}	w )Nr  z=rr  ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r  )rJ  r  r   )
asmr  rI  r  r  inputsrF   i
input_refsr  s
             r   inline_asm_elementwisez&TritonOverrides.inline_asm_elementwise  s     *%0YY71A78
))TF6-Bac-B$BCK,SEk]$zlR[\g[hhrszr{  |C  DH  CI  IJ  K  	K  8-Bs   A.	A3
c                    d|  dS )Nztl_math.cos(r  r   r  s    r   coszTritonOverrides.cos  r  r   c                    d|  dS )Nztl_math.sin(r  r   r  s    r   sinzTritonOverrides.sin  r  r   c                    t        d      )Nz/ops.index_expr not implemented outside a kernelr  )r   r   rI  s      r   
index_exprzTritonOverrides.index_expr  s    !"STTr   c                    t        d      )Nz+ops.masked not implemented outside a kernelr  )r  bodyothers      r   maskedzTritonOverrides.masked  s    !"OPPr   c                    d|  dS )Nzlibdevice.lgamma(r  r   r  s    r   lgammazTritonOverrides.lgamma       #1#Q''r   c                    d|  dS )Nzlibdevice.erf(r  r   r  s    r   erfzTritonOverrides.erf        s!$$r   c                    d|  dS )Nzlibdevice.cosh(r  r   r  s    r   coshzTritonOverrides.cosh  r  r   c                    d|  dS )Nzlibdevice.sinh(r  r   r  s    r   sinhzTritonOverrides.sinh  r  r   c                    d|  dS )Nzlibdevice.acos(r  r   r  s    r   acoszTritonOverrides.acos  r  r   c                    d|  dS )Nzlibdevice.acosh(r  r   r  s    r   acoshzTritonOverrides.acosh  r  r   c                    d|  dS )Nzlibdevice.asin(r  r   r  s    r   asinzTritonOverrides.asin  r  r   c                    d|  dS )Nzlibdevice.asinh(r  r   r  s    r   asinhzTritonOverrides.asinh  r  r   c                    d|  d| dS )Nzlibdevice.atan2(r  r  r   r?  r  s     r   atan2zTritonOverrides.atan2       "!Bqc++r   c                    d|  dS )Nzlibdevice.atan(r  r   r  s    r   atanzTritonOverrides.atan  r  r   c                    d|  dS )Nzlibdevice.atanh(r  r   r  s    r   atanhzTritonOverrides.atanh  r  r   c                    d|  d| dS )Nzlibdevice.copysign(r  r  r   r  s     r   copysignzTritonOverrides.copysign  s     %QCr!A..r   c                    d|  dS )Nzlibdevice.erfc(r  r   r  s    r   erfczTritonOverrides.erfc$  r  r   c                    d|  dS )Nzlibdevice.erfinv(r  r   r  s    r   erfinvzTritonOverrides.erfinv)  r
  r   c                    d|  d| dS )Nzlibdevice.hypot(r  r  r   r  s     r   hypotzTritonOverrides.hypot.  r  r   c                    d|  dS )Nzlibdevice.log10(r  r   r  s    r   log10zTritonOverrides.log103  r  r   c                    d|  dS )Nzlibdevice.log2(r  r   r  s    r   log2zTritonOverrides.log28  r  r   c                    d|  d| dS )Nzlibdevice.nextafter(r  r  r   r  s     r   	nextafterzTritonOverrides.nextafter=  s     &aS1#Q//r   c                    |  d| S Nr  r   r  s     r   logical_andzTritonOverrides.logical_andB      Cs|r   c                    |  dS )Nz == 0r   r  s    r   logical_notzTritonOverrides.logical_notF  s    E{r   c                    |  d| S Nz | r   r  s     r   
logical_orzTritonOverrides.logical_orJ  r3  r   c                    d|  d| dS )Nr   ^ r  r   r  s     r   logical_xorzTritonOverrides.logical_xorN  s    1#S1~r   c                    |  d| S r1  r   r  s     r   bitwise_andzTritonOverrides.bitwise_andR  r3  r   c                    d|  S )N~r   r5  s    r   bitwise_notzTritonOverrides.bitwise_notV  s    1#wr   c                    |  d| S r8  r   r  s     r   
bitwise_orzTritonOverrides.bitwise_orZ  r3  r   c                    |  d| S )Nr;  r   r  s     r   bitwise_xorzTritonOverrides.bitwise_xor^  r3  r   c                    |  d| S )Nz << r   r  s     r   bitwise_left_shiftz"TritonOverrides.bitwise_left_shiftb      D}r   c                    |  d| S )Nz >> r   r  s     r   bitwise_right_shiftz#TritonOverrides.bitwise_right_shiftf  rH  r   c                     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r  r  r   seedr   s     r   randzTritonOverrides.randj  s%    VHO,$r&++r   c                     d| d}d|  d| dS )Nr  rL  z	tl.randn(r  r  r   rM  s     r   randnzTritonOverrides.randno  s%    VHO,4&6(!,,r   c           	     ,    d| d}d|  d| d| d| d	S )Nr  rL  ztriton_helpers.randint64(r  r  r   )rN  r   lowhighs       r   	randint64zTritonOverrides.randint64t  s1    VHO,*4&6("SED6KKr   c                    t        d      )Nz.ops.load_seed not implemented outside a kernelr  )r   r   s     r   	load_seedzTritonOverrides.load_seedy  s    !"RSSr   c                    d|  dS )Nzlibdevice.rsqrt(r  r   r  s    r   rsqrtzTritonOverrides.rsqrt}  r  r   c                    d|  dS )Nzlibdevice.log1p(r  r   r  s    r   log1pzTritonOverrides.log1p  r  r   c                    d|  dS )Nzlibdevice.tan(r  r   r  s    r   tanzTritonOverrides.tan  r  r   c                R   t         j                  j                  j                  j	                  |       }|rt        |d      r|j                  }nd }t        j                  rBt        j                  j                  r(t               dkD  r|t        j                  k7  r|d|  dS d|  dS )NrI  )r      zlibdevice.fast_tanhf(r  zlibdevice.tanh()rL   r   r   r   getr   rI  r"   r  r   r  r  r   r   )r?  r   rI  s      r   tanhzTritonOverrides.tanh  s     ((,,**..q1ww0MMEE  !!"$v-&! +1#Q//$QCq))r   c                    d|  dS )Nztl.sigmoid(r  r   r  s    r   sigmoidzTritonOverrides.sigmoid  s     QCq!!r   c                    d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0r   r  s    r   signbitzTritonOverrides.signbit  s#     "!L3NqcQUV	
r   c                    d|  d| dS )Nzlibdevice.fmod(r  r  r   r  s     r   fmodzTritonOverrides.fmod  s     !2aS**r   c                    d|  d| dS )Nr  r  r  r   r  s     r   powzTritonOverrides.pow  s      s"QCq))r   c                    d|  dS )Nztl_math.log(r  r   r  s    r   logzTritonOverrides.log  r  r   F)r   c                    d|  dS )Nzlibdevice.isinf().to(tl.int1)r   r  s    r   isinfzTritonOverrides.isinf       "!M22r   c                    d|  dS )Nzlibdevice.isnan(rm  r   r  s    r   isnanzTritonOverrides.isnan  ro  r   c                    d|  dS )Nzlibdevice.nearbyint(r  r   r  s    r   roundzTritonOverrides.round  s     &aS**r   c                    d|  dS )Nr  r  r   r  s    r   floorzTritonOverrides.floor  r  r   c                H    |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nr  r  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r  r   )r  r  r  rems       r   floordivzTritonOverrides.floordiv  sV    
 D}3qclA3j+;C5vVTXSYY\]a\bbcddr   c                f   t        j                  dt        j                        }t        j                  t        j
                  ||       t        j                        }t        j                  t        j
                  | |      t        j                        }t        j                  ||      }| d|  dS )Nr   r{  .dtype))r  r  r   rR  r  ltrL  sub)r?  zleftrightr}  s        r   signzTritonOverrides.sign  su    LLEKK(||SVVAq\EJJ7cffQlUZZ8ggdE"d1#W%%r   c                    d|  dS )Nr  r  r   r  s    r   trunczTritonOverrides.trunc  r  r   c                    |  d| S )Nr  r   r  s     r   truncdivzTritonOverrides.truncdiv  s     D}r   c                    d|  dS )Nr  r  r   r  s    r   ceilzTritonOverrides.ceil  r  r   NT)rI  rh  r  Optional[torch.dtype])rI  rh  r  rh  )Sr   r   r   r   mathr-  e_LOG_2_Estaticmethodr  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r  r  r   r  r  r	  r  r  r  r  r  r  r  r  r  r!  r#  r%  r'  r)  r+  r/  r2  r6  r9  r<  r>  rA  rC  rE  rG  rJ  rO  rQ  rU  rW  rY  r[  r]  ra  rc  re  rg  ri  rk  rn  rq  rs  ru  ry  r  r  r  r  r   r   r   r  r  ;  s   Otyy H ,0	:&:& ):& :&x    D D" < < #  #  8   )  ) &  & '  ' "  "  " 3 3 3 3 * * xH xHt "&emmTPQK K #  # #  # U U Q Q (  ( %  % &  & &  & &  & '  ' &  & '  ' ,  , &  & '  ' /  / &  & (  ( ,  , '  ' &  & 0  0                     , , - - L L T T '  ' '  ' %  % *  *& "  " 
 
 +  + *  * #  # /3 0 3 /3 0 3 +  + '  ' e e & & '  '  
 &  &r   r  r   c                       e Zd ZdZ fdZeej                  d               Zed        Z	ed        Z
ed        Zed        Zed        Ze	 	 	 	 	 	 	 	 	 	 d
d	       Z xZS )r  a   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                D    t        |   |i | | j                          y r   )r_  r`  _setup_libdevice_routing)r   r  re  rb  s      r   r`  zTritonKernelOverrides.__init__  s#    $)&) 	%%'r   c                   ddl m t        j                  j                  j
                  D ]  }t        | |      sJ t        | |      }fd}|dk(  rDt        d      sJ t        j                  |||      }||_
        t        | |t        |             kd }t        j                  |||      }||_
        t        | |t        |              y)z<Set up routing to libdevice implementations for fp64 inputs.r   )OpDecompositionsc                    | j                   t        j                  k7  r ||       S  t        |      |       j                  S r   )rI  r   r   r  r  )r?  _original_impl_fn_namer  s      r   decomposition_routerzLTritonKernelOverrides._setup_libdevice_routing.<locals>.decomposition_router  s9    77emm+)!,,>7#3X>qAGGGr   rc  )r  r  c                ^    | j                   t        j                  k(  r	d| d|  dS  ||       S )Nz
libdevice.r  r  )rI  r   r   )r?  r  r  s      r   dtype_routerzDTritonKernelOverrides._setup_libdevice_routing.<locals>.dtype_router  s2    77emm+'z1#Q77)!,,r   N)torch._inductor.codegen.commonr  r   r  r%   op_requires_libdevice_fp64r   r  	functoolspartialr   setattrr  )r   fn_nameoriginal_implr  fnr  r  s         @r   r  z.TritonKernelOverrides._setup_libdevice_routing  s    
 	D,,GG 	4G3(((#C1MH )#/;;;&&(QX &Wl2&67- ""]WB "BKC,r"23;	4r   c                r    t         j                  j                         }dg|z  }| j                  |||      S )NrN   r^  )rL   r   r   r  )r   r  rI  r   r   s        r   r  zTritonKernelOverrides.constant*  s9    
 xx**,d
##E5#>>r   c                   t         j                  j                  |dd       }t        |t              sJ |j
                  r|j
                  }nt        j                  |j                        }t         j                  j                         }|t        j                  t        j                  fvr|n|}t        j                  j                  }	 dt        j                  _        t         j                  j                   j#                  t         j                  j$                  |j&                  t)        |      ||      }|t        j                  _        |t        j                  t        j                  fvrit         j                  j                   j#                  t         j                  j$                  | j+                  ||      t-        |      |j.                        }n|}|j0                  D ]l  }t3        |t4        j6                        st        j8                  |t         j                  j                   j:                  |j<                     j>                        }n ||k7  r_t         j                  j                   j#                  t         j                  j$                  | j+                  ||      ||j.                        }|j@                  |_         |S # |t        j                  _        w xY w)NF	block_ptrtma_compatibility_checkerra  rI  r   r  )!rL   r   indexingr   r   r   r   r   r   get_index_dtype_as_torch_dtyper   rR  int64r"   test_configsruntime_triton_dtype_assertr   r  r  r   r=   r  rH   r   r   r   r   r   promote_typesr   r   rI  r   )	r   r   rI  r  r   r  origr   	index_vars	            r   r  z TritonKernelOverrides.index_expr3  s1   88$$ET % 
 (O444   ))E!11(..AE hh==?u{{EKK&@@k "">>
	C>CF;((,,''  "",T2 ( C ?CF;ekk22((,,''  S%()%0ii	 ( C  E!.. 	!)TXX6!//qxx||77	GMME #hhll++HH$$LLk2%))	 ,  !**
= ?CF;s   A.K K%c           
        | zt         j                  j                  `t        j                  j
                  j                  t        j                  j                  |  dt         j                  | j                        } |j                  j                  d      }|sJ d       d}|D ]>  }|j                  D ]-  }|j                  dk7  st        |j                  d         s+d	} > @ |rd n|}t        j                  j                  | |
      5 } |       }	d d d        |r	j                   j"                  rt        |      }t        j                  j
                  j                  t        j                  j                  d|	 dt%        |       d|	 dt'        j(                  |      |	j*                  |	j                        }t-        j.                  |	|      }
n	}
|
j0                  j3                         |
S # 1 sw Y   xY w)N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrN   Tr  r  z.shape, r  r{  r  )r   r  r  rL   r   r   r  r  r   r   rA  
find_nodesr  targetrj   
mask_loadsra  is_boolr`   r    wraprI  r  r  r   discard)r  r  r  nodes
need_wherenoderf  r  new_maskr\  r  s              r   r  zTritonKernelOverrides.maskedq  s    1 1 =88<<((  &%jjjj	 ) D 

%%%2AAAu
  	Dyy ::'+CCHHQK+P!%J	 #XX  U 3 	xVF	 }}$$UHHLL))  6((=+?*@6('R"''.llll * E ))Hfe4CCh'
)	 	s   G88Hc                    t         j                  j                  j                  |       }d| dt         j                  j                  j	                  d|       dS )Ntl.load( + load_seed_offsetr  )rL   r   r  inputseed_offset)r   r   r   s      r   rW  zTritonKernelOverrides.load_seed  sI    hhmm!!$'se3qxx}}889KVTUUVW	
r   c                0   d|  d}t         j                  j                  j                  |      x}r|S t         j                  j                  j	                  | j
                  | j                        }t         j                  j                  j	                  t        j                  | j                        }t         j                  j                  j                  | d| d|  d       t         j                  j                  j                  |||f       ||fS )Nzfrexp(r  r  r  z = triton_helpers.frexp()rL   r   r   try_getnewvarrI  r   r   rR  r  r   put)r?  	cache_keycse_valmantissaexponents        r   frexpzTritonKernelOverrides.frexp  s    QCqM	hhll**95575N88<<&&QWWAGG&D88<<&&U[[&H	""j8*$<QCqA	
 	
Xx$89(##r   c                    t         r   r  )r   r   r  
extra_metas       r   partial_accumulatez(TritonKernelOverrides.partial_accumulate  s
     "!r   )
r   r   r   r   r  rT   r  dict[str, Any]r   r   )r   r   r   r   r`  r   r  cacher  r  r  r  r  rW  r  r  ri  rj  s   @r   r  r    s    ( __"4  "4H ? ? ; ;z , ,\ 
 
 $ $ """ " #	"
 
" "r   r  c                  H    e Zd ZU dZded<   ded<   ddZdddd	Zd
 Zd Zy)HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersc                     i | _         g | _        y r   )r  r  r   s    r   r`  zHelperFunctions.__init__  s    !!#r   _triton_helper_fn	base_namec                   | j                   j                  |      }||S | t        | j                         }|| j                   |<   | j                  j	                  |j                  |             |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        )r   )r  r`  r   r  r  r  )r   template_coder  existing_namer   s        r   addzHelperFunctions.add  sw     ,,00?$  S!7!789:.2]+%%m&:&:&:&EFr   c                ,    t        | j                        S r   )iterr  r   s    r   __iter__zHelperFunctions.__iter__  s    D**++r   c                     | j                   |   S r   )r  )r   rs  s     r   __getitem__zHelperFunctions.__getitem__  s    %%c**r   Nr   r   )r  r   r   r   )	r   r   r   r   r   r`  r  r  r  r   r   r   r  r    s+    -##  $ 4G ,,+r   r  c                     e Zd ZU dZ ej
                  e      Zded<    ej
                  e      Z	ded<    ej
                  e      Z
ded<    ej
                  e      Zded<   ej                   G d d	             Zej                   G d
 de             Zej                   G d de             ZddZ	 	 	 	 	 	 ddZddZy)r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr@  r   r0  r2  r5  c                      e Zd ZU ded<   ded<    ej
                  d      Zded<   d Zed        Z	e
e	 	 	 	 	 	 dd	              Zd
 Zd Zy)r*  r#  original_stridesr  F)initr  c                   t        | j                        dkD  sJ t        | j                        t        | j                        k(  sJ t        t	        t        | j                                    }| j                  |k(  | _        t        | j                        D ci c]  \  }}||
 }}}t	        t        |            D cg c]  }||   	 c}| _        y c c}}w c c}w rg  )r   r  r  rR  rT  _is_identity	enumerater  )r   identity_sort_idxr  r  sorted_dims_by_strides_maps        r   __post_init__z*BlockParameters.StrideSorter.__post_init__  s    t,,-111t}}%T-B-B)CCCC $U3t/D/D+E%F G $1B BD <ET]];S)T41a!Q$)T&)T s#=>?$ +1-$D  *U$s   CCc                    | j                   S r   )r  r   s    r   r  z(BlockParameters.StrideSorter.is_identity  s    $$$r   c                     y)zBCreate a `StrideSorter` that can be used to sort block parameters.Nr   r   r  r>  s      r   r]  z#BlockParameters.StrideSorter.create	  s    r   c                b    | j                   s| j                  D cg c]  }||   	 c}S |S c c}w r   r  r  r   attrr  s      r   sortz!BlockParameters.StrideSorter.sort  .    ##)-7AQ77K 8   ,c                b    | j                   s| j                  D cg c]  }||   	 c}S |S c c}w r   r  r  s      r   r  z#BlockParameters.StrideSorter.revert  r  r  Nr  zlist[Union[int, sympy.Expr]]r>  rp   r   r*  )r   r   r   r   dataclassesfieldr  r  r  r  r   r   r]  r  r  r   r   r   StrideSorterzBlockParameters.StrideSorter  s    ##%6[%6%6E%BB	 
	% 
	% 
		U#?	ULT	U)	U 
 
	U
	
	r   r  c                  <     e Zd Z fdZe	 	 	 	 	 	 dd       Z xZS )$BlockParameters.IdentityStrideSorterc                "    t         |           y r   r_  r  r   rb  s    r   r  z2BlockParameters.IdentityStrideSorter.__post_init__      G!#r   c           
     L     | |t        t        t        |                        S )Nr  r  )rR  rT  r   r  s      r   r]  z+BlockParameters.IdentityStrideSorter.create  s'     !1eC(8$9:; r   r  )r   r   r   r  r   r]  ri  rj  s   @r   IdentityStrideSorterr    s4    	$ 
	#?	LT	)	 
	r   r  c                  @     e Zd ZdZ fdZe	 	 	 	 	 	 dd       Z xZS )+BlockParameters.TensorDecriptorStrideSorterzT
        Sorts BlockParameters dimensions with strides in descending order.
        c                "    t         |           y r   r  r  s    r   r  z9BlockParameters.TensorDecriptorStrideSorter.__post_init__.  r  r   c                    t        t        t        |                  }	 t        j                  ||d      } | ||      S # t
        $ r |}Y w xY w)a  
            If the strides are not all known constants or if the strides are already
            sorted in descending order, return identity sort.

            For example if block_shape @ strides is [ZBLOCK, XBLOCK, YBLOCK] @ [8, 1, 16]
            The indices to sort the strides in descending order will be [2, 0, 1].
            The indices to revert back to the original order will be [1, 2, 0].
            T)reverser  )rR  rT  r   r%   rP  rQ  )r   r  r>  identity_sortr  s        r   r]  z2BlockParameters.TensorDecriptorStrideSorter.create1  sa     !s+;'<!=>M	) !,,Y8HRVW
 !1! 	 " )()s   A AAr  )r   r   r   r   r  r   r]  ri  rj  s   @r   TensorDecriptorStrideSorterr
  (  s9    		$ 
	#?	LT	)	 
	r   r  c                    t        |       }t        d | |fD              \  }} |di |D ci c]  }|||   ||   z    c}S c c}w )z0
        Concatenates block parameters.
        c              3  F   K   | ]  }t        j                  |        y wr   )r  asdictr  s     r   r  z*BlockParameters.__add__.<locals>.<genexpr>S  s     Bq[''*Bs   !r   )r  r   )r   r  r   r  r  r  s         r   __add__zBlockParameters.__add__N  sR     4jBT5MBB19a8sc1S6AcF?*8998s   Ac                    |j                  | j                  |      }t        di t        j                  |       j                         D ci c]  \  }}||j                  |       c}}}||fS c c}}w )z
        Sort `BlockParameter` with stride_sorter_cls. Returns block parameters
        as well as a `StrideSorter` which contains information on how the sort
        can be reverted.
        )r>  r   )r]  r2  r   r  r  r|  r  )r   r=  r>  r+  r  r  r!  s          r   rE  z,BlockParameters.maybe_sort_with_stride_orderV  s~     *000S  
 !, 2 24 8 > > @C ]'',,
 }$$s   A1c                    fd}t        di t        j                  |       j                         D ci c]  \  }}| ||       c}}S c c}}w )zA
        Remove dimensions where removable_dims is True.
        c                R    t        |       D cg c]	  \  }}|s| c}}S c c}}w r   )r  )ititemis_removableremovable_dimss      r   filter_dimsz0BlockParameters.remove_dims.<locals>.filter_dimsl  s3     +.b.*A&D,#   s   #r   )r   r  r  r|  )r   r  r  r  r  s    `   r   rD  zBlockParameters.remove_dimsg  sO    
	  
5@5G5G5M5S5S5UVcsK$$V
 	
Vs   AN)r  r   r   r   )r=  ztype[StrideSorter]r>  rp   r   z4tuple[BlockParameters, BlockParameters.StrideSorter])r  r'  r   r   )r   r   r   r   r  r  rR  r   r   r0  r2  r5  	dataclassr  r  r  r  rE  rD  r   r   r   r   r     s     0k//EEE$5K$5$5d$KK!K 1 1 1$ GGG 1 1 1$ GGG& & &P |   #l # #J:%!3%@H%	=%"
r   r   c                  *    e Zd ZdZd ZddZd Zd Zy)"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                    || _         g | _        g | _        t        j                  t        j
                        | _        d| _        d| _        y rg  )	r  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   r  s     r   r`  z+CooperativeReductionWorkspaceCache.__init__~  s@    	*66{7H7HIr   c                    | j                   j                  |      }|r|j                         S | j                  j	                  |d      \  }}}| j
                  j                  |||f       ||fS rw  )r%  r`  popleftr  	workspacer   r  )r   nbytescachedws_namer  	ws_offsets         r   allocatez+CooperativeReductionWorkspaceCache.allocate  sk    %%))&1>>## $		 3 3FE BI  &'9!=>##r   c                    | j                   D ]&  \  }}}| j                  |   j                  ||f       ( | j                  | _         g | _        | xj                  dz  c_        y NrN   )r!  r%  r  r   r&  )r   r+  r-  r.  s       r   on_loop_endz.CooperativeReductionWorkspaceCache.on_loop_end  s_    *.// 	F&FGY  (//)0DE	F++1r   c                H    | j                   }| xj                   dz  c_         |S r1  )r'  )r   priors     r   increment_store_countz8CooperativeReductionWorkspaceCache.increment_store_count  s#      Ar   N)r+  r   )r   r   r   r   r`  r/  r2  r5  r   r   r   r  r  x  s    
$r   r  c                  $    e Zd ZU ded<   d Zd Zy)FixedTritonConfigzdict[str, int]r"   c                     | j                   |   S r   r"   r   r  s     r   r  zFixedTritonConfig.__getitem__  s    {{4  r   c                    || j                   v S r   r9  r:  s     r   __contains__zFixedTritonConfig.__contains__  s    t{{""r   N)r   r   r   r   r  r<  r   r   r   r7  r7    s    !#r   r7  c                      e Zd ZdZddZy)	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    c                Z    t         j                  j                  x}r||j                  fS |S r   )rL   r   
_load_maskr   )r   r  r  s      r   augment_keyzTritonCSE.augment_key  s,    88&&&4&tyy))r   N)r  r   r   zUnion[str, tuple[str, str]])r   r   r   r   rA  r   r   r   r>  r>    s    
r   r>  c                  d    e Zd ZU dZded<   ded<   ded<   ded<   d	 Z	 	 dd
Z	 	 	 	 ddZddZy)TMACompatibilityCheckerzO
    Checks if the TMA API can be used for load / store triton operations.
    TritonKernelr   rh  rI  r   r  forcec                    d| _         y )Nz2Cannot use TMA descriptor for load / store since: )failed_debug_prefixr   s    r   r  z%TMACompatibilityChecker.__post_init__  s
    #W r   c                   | j                   ryt        j                  j                         j                  dk(  rXt
        j                  j                         d   dk\  r4t        j                  j                  rt        j                  r
t               s!t        j                  d| j                         y| j                   r7| j"                  j$                  r!t        j                  d| j                         yy)NTr  r   r  z}%s Requires triton>=3.4.0, a CUDA device with cc>=9.0 and `use_tensor_descriptor` and `assume_aligned_inputs` options enabledFz/%s stores with `no_x_dim` cannot load 16 bytes.)rE  rL   rA  get_current_device_or_throwr  r   r  r  r"   r   use_tensor_descriptorassume_aligned_inputsr   rk  debugrG  r  r   rG  r   s    r   can_use_tmaz#TMACompatibilityChecker.can_use_tma  s     ::GG//166&@

00215:33,,)+ II[ ((  >>dkk22IIA(( r   c           
     
   | j                   rA|j                  D cg c]+  }t        j                  j                  j                  |      - }}n|j                  }t        j                  j                  j                  |d   t        j                  d            s"t        j                  d| j                  |       y| j                  j                  }|dd D ]  }t        j                  j                  j                  t        ||z  dt        j                  d            t        j                  d            rbt        j                  d| j                  ||        y |j                  d   }t        j                  j                  j                  |t        j                  d            r,t        j                  d	| j                  |j                         yd}d}|j                   D ])  }	t"        j$                  D ]  }
t'        |	|
      s|	}|
} ) + |r|sJ | d
t"        j$                          | j(                  j*                  r| j,                  st.        |   }d}| j(                  j0                  D ],  }|j2                  s|j4                  |k(  s |j6                  } n |J | j(                  j9                  |      }|j;                  ||i      |z  }t        j                  j                  j=                  |t        j                  d            s-t        j                  d| j                  |j                  |       yy	 	 d	 	 	 	 	 	 	 dd}||z  dz
  }|j?                  t@        |      j?                  t        |      }tC        tE        t        jF                  ||d                  }|| j(                  jI                  t.        |         kD  r#t        j                  d| j                  ||       y| j(                  jK                  |      }	| j(                  jL                  rW|| j(                  jL                  |	   kD  rt        j                  d| j                  |	| j(                  jL                  |	   |       ytO        || j(                  jP                  jS                  |	d            | j(                  jP                  |	<   yc c}w # tT        $ r. t        j                  d| j                  |j                         Y yw xY w)z
        Check if the block parameters are valid for TMA.
        If force, we allow relying on symbolic hints equivalent
        to what we check for Triton templates.
        r<  rN   z=%s TMA API requires innermost stride to be 1. Strides are: %sFN   r   zU%s TMA API requires outer strides to be 16 byte aligned. Dtype bytes: %d, strides: %sz>%s innermost block shape cannot load 16 bytes. Block shape: %sz, expr must contain a single block type from zj%s persistent reduction innermost block shape cannot load 16 bytes. Block shape: %s, persistent RBLOCK: %dc                    | |z  }|r||z  }|S r   r   )r?  r  r~  r  s       r   indexing_div_repzQTMACompatibilityChecker.are_block_parameters_compatible.<locals>.indexing_div_repQ	  s    
 a%C!AgJr   zC%s the minimum block size to satisfy expression %s is too large: %dzT%s For block %s, fixed config block size %d is smaller than the minimum required: %dz?%s innermost block shape cannot load 16 bytes. Block params: %sTr   )r?  r   r  r   r~  zOptional[sympy.Expr]r   r   )+rE  r2  rL   rA  r:  symbolic_hintrB  r   r   rk  rL  rG  rI  rQ  r   r0  r   r   r   r   r   persistent_reductionr  r   rV  rM  r   numel_get_persistent_RBLOCKsubsstatically_known_geqreplacer   r4   r  nsolve	max_blockr  fixed_configrz   tma_min_block_sizesr`  rB  )r   block_paramsstr2  element_sizerZ  innermost_block_shapeinnermost_block_typeinnermost_block_symtblock_type_str
block_symtinnermost_tree_prefix
tree_numelrp  persistent_rblockinnermost_block_bytesrQ  
solve_exprsolve_expr_simplifiedmin_block_sizes                       r   are_block_parameters_compatiblez7TMACompatibilityChecker.are_block_parameters_compatible  s    ::=I=Q=Q79  ..r2G  #**G ww77U]]STEUVIIO((
 zz**crl 	F77##;; 5q%--:KLa  		k,, 	 	" !- 8 8 < 7733!5==#3
 IIP((((
 ##3@@ 	N+77 
!.*=+9(+5(		 $(< 	
$%%QR_RkRkQlm	
<
 ;;++DNN %//C$D!J[[,, >>xx#88%&WW
	
 ))) $ B B: N%**,@BS+TU " 77##88%u}}R'8 		 A,, ,,%	 T IB
 /3!! ,  	 3\ABF
(2(:(:.)'/+;< & "110" "DKK$9$934%  II]00-&	 !!%!9!9:N!O;;++%(@(@(PP		< 44* KK44^D*  % GJ&77;;NANGDKK33NC Gv  		U,, ,,
 s&   0SB/S 3BS ;AS 4S?>S?c                    | j                   S )aH  
        Can you lift the make_tensor_descriptor
        call to the top of the kernel? This requires
        being certain that all of the shape, stride,
        and block_shape information is handled in arguments
        or top level definitions.

        Right now we assume this is always possible if you force TMA.
        )rE  r   s    r   r-  z TMACompatibilityChecker.can_lift	  s     zzr   Nr  )r]  r   r   r   )	r   r   r   r   r   r  rM  rl  r-  r   r   r   rC  rC    sS     OKX!	!Fm%m 
m^
r   rC  c                      e Zd ZU dZeZded<   eZded<   dZ	e
ZdZded	<   	 	 	 	 dU	 	 	 	 	 	 	 dV fd
ZedWd       ZedWd       ZdXdZdWdZd Zd Zd Zd ZdWdZd ZedYd       Zdddddd	 	 	 	 	 dZdZ	 d[	 	 	 	 	 	 	 d\dZd[dZ	 	 	 	 	 	 	 	 d]dZd Zd Z d Z!	 	 	 	 d^dZ"d_d Z#	 d`	 	 	 	 	 	 	 	 	 dad!Z$dbd"Z%d# Z&dcd$Z'	 	 dd	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ded%Z(dYd&Z)dfd'Z*	 	 	 	 	 	 dgd(Z+	 	 	 	 	 	 	 	 	 	 dhd)Z,	 	 did*Z-did+Z.d, Z/d- Z0d. Z1d/ Z2	 	 djd0Z3	 	 	 	 	 	 dkd1Z4	 	 	 	 	 	 dld2Z5	 	 	 	 	 	 	 	 dmd3Z6	 	 	 	 	 	 	 	 	 	 dnd4Z7dod5Z8d6 Z9dpd7Z:dqd8Z;d9 Z<d: Z=ed;        Z>d`dYd<Z?ed=        Z@ed>        ZAd? ZBdrd@ZCdA ZD	 ds	 	 	 	 	 dtdBZEdbdCZFdudDZGdvdEZHdwdFZI	 	 	 	 	 	 dxdGZJdwdHZKdydIZLdzdJZMd{dKZNdWdLZOd|dMZPeQdpdN       ZRd}dOZSd~dPZTeQddQ       ZUddRZVd}dSZW	 	 	 	 	 	 ddTZX xZYS )rD  zdA class to represent a triton kernel and helpers to generate
    triton kernel programmatically
    r  helper_functionszCallable[[sympy.Expr], str]kexprTNzOptional[bool]3transpose_discontiguous_tensor_descriptors_overridec                   || _         || _        t        |   |fi | t	        | j
                  | j                        | _        i | _        t               | _
        t               | _        t               | _        t        t                  | _        || _        t#        j$                         | _        t)        t*        t*        f          | _        t/               | _        t3        j4                  t(              | _        t)        t*        t8        f          | _        || _        t3        j>                         | _         d| _!        t        tD                  | _#        d | _$        | jJ                  r| jM                  | jN                         | jP                  r| jS                          | jU                          | jP                  r| jW                          d| _,        g | _-        y )Nr   F).optimize_maskr[  r_  r`  r>  newvar_prefixsuffixr   prologue_cacherV   prologuepost_loop_combinepost_loop_storer   r	   outside_loop_varsr  r}  countblock_ptr_iddictr   block_ptr_to_bufferr  ro  r"  r#  pointer_advancementsr  r\  hint_overrideCounter_load_counts_load_indexr/   autotune_hintstriton_metarJ  codegen_reduction_numelsr  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_maskhas_load_with_contiguous_rdimstores_with_contiguous_rdim)r   tilingr  rs  r[  r  re  rb  s          r   r`  zTritonKernel.__init__	  sr    $1(*6*T//=.0(6(81?1A/=/?!+C!2#6 %OO-#'S>#3  / 1##D) 	! $(S>#3 *6A6I6I6K )6859  ))$))4%%++-!%%002-2*68(r   c                   t         j                  st         j                  j                  sy| j                  }|D cg c]  }t        |t        j                        r|  }}t        |      dk(  ry	 t        j                  j                  j                  | ||      }t        d |D              S c c}w # t        $ r Y yw xY w)NFr   c              3  &   K   | ]	  }|d k(    ywr  r   )r   rZ  s     r   r  z4TritonKernel._has_stride1_on_rdim.<locals>.<genexpr>	  s     966Q;9   )r"   deterministicr  force_filter_reduction_configsr   r   r   r   r   rL   rA  r:  stride_varsZeroDivisionErrorry   )r   support_varsr   reduce_varsr  s        r   _has_stride1_on_rdimz!TritonKernel._has_stride1_on_rdim	  s       F$7$7$V$V)) $
c=#@#@A 
 
 {q 	''**66uk<XK 9[999!
 ! 		s   #B.1+B3 3	B?>B?c                <    t        d | j                  D               S )Nc              3  2   K   | ]  }t        |        y wr   )rX   )r   r   s     r   r  z>TritonKernel.has_store_with_contiguous_rdim.<locals>.<genexpr>	  s      
(,d#
   )rC  r  r   s    r   has_store_with_contiguous_rdimz+TritonKernel.has_store_with_contiguous_rdim	  s'     
040P0P
 
 
 	
r   c                    t        |      S r   )rF   )r   rI  s     r   dtype_to_strzTritonKernel.dtype_to_str
  s    5!!r   c                p    | j                   xr) t        j                  j                  | j                        S r   )rJ  rL   choices should_use_cooperative_reductionrL  r   s    r   r  z-TritonKernel.should_use_cooperative_reduction
  s-    $$ 
)S)SMM*
 	
r   c                     j                   sJ  j                  D ]$  }|j                  |xj                  dz  c_        &  j                  d   } j                  rt        | j                  d         } j                  j                  |       _        t         j                         _
         j                  j                  d       t         fd j                  D              r j                  j                  d       yy)z/One time setup code for cooperative reductions.NrN   r?  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c              3  Z   K   | ]"  }|j                   rj                  |        $ y wr   )rM  r  )r   r   r   s     r   r  z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>&
  s0      
   ''--
s   (+z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  rV  grid_dimrK  r[  r   r  
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher  r   ry   r   )r   r   	sem_counts   `  r   r  z'TritonKernel.init_cooperative_reduction

  s    )))) $$ 	#D}}("	# KK$		4+<+<X+FGI#yy33I>5WII6
2 					
  
((
 

 IIP
r   c                   d}| j                   s| d}| j                  j                  d|        | j                         r| j                  j	                  d       y | j                   rJ | j                  j                  d       y )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))rG  r  r   _has_constant_xmaskr   )r   rsplit_aranges     r   r  z,TritonKernel.init_cooperative_reduction_mask/
  s{    >}},oY7M		.}o>?##%II }}$$IIer   c                2   | j                   D ]q  }|j                  s| j                  || j                         ,| j                  s9| j                  j                  |j                   d| j                  |              s | j                  rt        d | j                   D              rS| j                  ddd      }| j                  |      }| j                  j                  d| j                  |              y | j                  | j                         y y )Nzbase = c              3  4   K   | ]  }|j                     y wr   )is_loopr   r   s     r   r  z2TritonKernel.codegen_range_tree.<locals>.<genexpr>Q
  s     =D4<<=s   baseTr   zrbase = )rV  r  iteration_ranges_codegen_headerr  rJ  r   r   iteration_ranges_ranges_codery   _get_reduction_symbols_flatten_reduction_indicesr   r  codegen_reduction_indices)r   r   rn_basesrbases       r   r  zTritonKernel.codegen_range_treeD
  s    $$ 		D<<44T499E&& 		##{{m74+L+LT+R*ST		   =D,<,<==66Dd 7  77A		  8D,=,=e,D+E!FG ..tyy9 !r   c                     y)z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Tr   r   s    r   need_numel_argszTritonKernel.need_numel_args\
  s     r   c                    | j                   xr4 t        j                  j                  | j                  | j
                        S r   )rJ  rL   r  should_use_persistent_reductionrL  r  r   s    r   r  z,TritonKernel.should_use_persistent_reductionf
  s5    $$ 
)R)RMM455*
 	
r   c                    | j                   xrG t        | j                        | j                  dz   k(  xr  | j                  xr | j                  d   dk(  S )NrN   r   )rS  r   rK  rI  r[  r   s    r   want_no_x_dimzTritonKernel.want_no_x_dimk
  sY    %% 1DKK D$;$;a$??1!!1 !!(+q0		
r   c                     y)Nztl.device_assertr   r   s    r   assert_functionzTritonKernel.assert_functions
  s    !r   F)
copy_shapedense_indexingoverride_maskr  r  c          
        !"#$  j                        j                  }d}t               !t        |t	        j
                  d            D ]y  }	t        |	t        j                        sJ |xs t        |	t        j                        }|rAt        |	t        j                        r? j                  j                  |	j                      }
!j#                  |
j$                         t        |	t        j&                  t        j(                  t        j*                  t        j,                  t        j.                  t        j0                  f      rt        j2                  D cg c]  }t        |	|      r	t4        |    }}t7        |      dk(  r	 t7        |      dk(  sJ d|	j                           !j9                  |d    d       | t:        j<                  j>                  xs |xs  j@                  duxr dk7  }d	}d}t               } jC                         D ]@  }|jE                  |jF                        rd	}nd}|j9                  |jH                   d       B |r& jJ                  rt:        j<                  jL                  srjO                         rz|sx j@                  slt7        !|z
        dk(  r[ jQ                        sJ|rH jR                  d
k(  r9	 	 	 	 	 	 dd"	 	 	 	 	 	 d fd$	 	 	 	 	 	 d"$fd#d!# fd} |       }||S d}d} jU                        } fd}tW              rst7         jY                               dk(  r |       \  }}nLt[        dgt7         jY                               z        }t]        dgt7         jY                               z        }d| d| d} j^                  r ja                         st        dg      !n
t               ! j@                  r!j9                   j@                         tc        |!|||      S |r|sˉ jd                  r jf                  r!ji                         } j@                  r|j9                   j@                         t        g d      }|jk                  |      sq|jm                  |      }|jo                         }t        |tp              sJ |js                  |       |j#                  |j$                         |jk                  |      sqdgt7         jY                               z  }|D ]y  }t        |tZ              sJ  jC                         D ]R  }|ju                  |jH                        s|jv                  }t        |tx              sJ  jY                         |   ||<   T { ddj{                  t}        tZ        |            z   dz   }t]        |      }d| d| d}n/ |       \  }}d| d| d}|!n|sr |       \  }}d| d| d}|!||s|r |       \  } }nd}|rt        |g      ! j@                  r!j9                   j@                          j                  !       tc        |!|||      S c c}w ) zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fr   r  r   rN   r   r  NTtl.int32c                    t        j                  | |j                               }|yt        |j                  gt
        j                  |      g|gt
        j                  |      g      S )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r0  r2  r5  )rO   match_affine_block_exprsymbolr   rT  r   r   r   )r   
range_treerZ  s      r   match_affine_blockz1TritonKernel.indexing.<locals>.match_affine_block
  sl     -DD:,,. >&%++,!.!=!=j!I J#H*;;JGH	 r   c                   |j                         }t        j                  dt        j                  t        j
                  |g            \  }}t        dt        |j                        | j                  t        ||            | j                  t        |||            z         }t        j                  | ||j                  |      }|y|\  }}}	t        j                  |      }
t         j"                  j$                  j'                  |j(                        t+        fd|
D              ryt,        j/                  |      }t1        ||
d         gt3        |
dd |dd       D cg c]%  \  }}t        j4                  t1        ||      |      ' c}}z   }|	D cg c]#  }t7        ||t,        j9                  |      i      % }}t;        ||||	      S c c}}w c c}w )
a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)r   r!   Nc              3  l   K   | ]+  }j                  |       xr j                  |        - y wr   )ro  statically_known_power_of_2)r   rT  rZ  r:  s     r   r  zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>%  sH        !==eYOO H$@@GGHs   14r   rN   r  )r  r   symbolsr  r  Wildrz   r   r  r{  r   r   rO   match_mod_div_block_exprrT  get_slice_numelsrL   rA  r:  rZ  r   ry   r   r   r   r  MinrE   r   r   )r   r  r  denommodulonum_dimsmatch_resultdimsr2  block_index_exprsslice_numelslinear_block_sizerT  rX  r0  r   r   rZ  r:  r   s                    @@r   match_mod_div_blockz2TritonKernel.indexing.<locals>.match_mod_div_block
  s   ( '--/	 !&"!))%**ykJ!v  
(()HY$>?++oi&OPQ	  3KK9j&6&6   ' !	%2CCDI 77++ NN:+<+<=	  ". 
   %2$@$@$L!-|A?1 '*,qr*:DH&E"s IIg&7?E1 !2	3  y-*H*H*TU3 3 ' +#)	 3s   ,*G (Gc                6    fD ]  } || |      }||c S  y)ze
                Match a block indexing subexpression involving a single range tree.
                Nr   )r   r  
match_funcmatchr  r  s       r   match_block_subexprz2TritonKernel.indexing.<locals>.match_block_subexprH  s:     ''# %J 'tZ8E($% r   c            	        t        j                  j                         D  ci c]  \  } }| |j                   c}}       }j	                         }|D cg c]&  }t        j                  ||j                               ( }}t        d |D              }t               }t        ||      D ]@  \  }}t        |j                  |j                              dkD  r y  ||      }	|	 y ||	z  }B |t        |      z
  }
j                         t         j"                  j$                  rt&        nt(        }t         j"                  j$                  rd}t        j*                  }nt-        t.              j1                         }j2                  	 j2                  }nt         j"                  j4                  }t7        d      r|d uz  }|rt        j8                  nt        j*                  }|j;                  ||
|j<                  ||      }|t(        k(  r,t-        t.              j?                  |j@                        sy |S c c}} w c c}w )Nc              3  <   K   | ]  }|j                           y wr   )r  r  s     r   r  zBTritonKernel.indexing.<locals>.match_block_expr.<locals>.<genexpr>i  s     *QT4;;=*Q   rN   Ftemplate_out_shape)r!  r"  rV  r   rW  r-  r=  )!rE   range_tree_nodesr|  r   r   rO   get_subexpr_involving_symbolr  r   r   r  r   intersectionr   sumfilter_masksr"   r   use_block_ptrr  r  r  r
   rC  r-  rq  )transpose_discontiguous_tensor_descriptorr   r  r]  rZ  rl  r!  )r  rp  index_relative_to_xyr_indexrV  r   index_subexprsrange_symbolsr]  subexprr!  r   options_classr-  r=  transpose_contiguousoptionsr  r   r   r  r   r  s                   r   match_block_exprz/TritonKernel.indexing.<locals>.match_block_exprX  s`   .8$2G2G2M2M2OP$!QAqvvIP/+ #557 !,	"  (DD3T[[]" " !+*Q[*Q Q.0%(n%E 
+MD' =55g6J6JKLqP# 1$?F~# F*L
+ 5s>7JJ !!), }}22 $0  ==..$H(7(L(L%04/1J1-  9AACH PP#$ !TT -
 #MMSS - t%9:,
$0FF, 0 (CC,AA & (..'$* +'"&..%&7 /  !$;;04/1J1- 5TT  ${ Q"s   I+I
c                      r7t         t              r  dd fS ddj                  d  D              z   dz    fS j                         t	        j                               fS )Nz.shaper  r  c              3  2   K   | ]  }t        |        y wr   r   )r   r
  s     r   r  zATritonKernel.indexing.<locals>._get_expand_str.<locals>.<genexpr>  s     *Fa3q6*Fr  r  )r   r   r  dense_size_strr   r  )r  r   s   r   _get_expand_strz.TritonKernel.indexing.<locals>._get_expand_str  sh    j#.(\0$66*F:*F!FFLjXX**,eD4H4H4J.KKKr   r  r  z, tl.int32)xmask)r   )r  ymaskzmaskr0_maskr   r  ,r  r  r  r   )r   r   r  rc   r   Optional[BlockParameters])r   r   r  rc   r   r  )r   z Optional[BlockDescriptorOptions])@prepare_indexingr   r   r  operator
attrgetterr   r   r   r   r   r   r   r   r   r   r   rd  r   r   r   r   r   r   r   r   r   r   r  r"   r   r  r@  r   r  var_listr   allow_block_ptrr  rM  is_indirect_indexingr  r  r   r  r   r   r[  r  r   rJ  r  copyissubset
differencerH  r\  r  r
  r   r  r  r  r  )%r   r   r  r  r  r  r  
index_varsr  r   r   r   prefix_matches
need_dense
have_densehave_loop_varsdense_mask_varsr   r  r  r   r   r   r  
mask_shapexyzr	tmp_masksr  expand_listr  rX  expand_shape_strr  r   r  r  r  s%   ```   `                          @@@@r   r  zTritonKernel.indexingw
  sY    %%e,''

%/\	*(*=*=f*EF !	:Cc5<<000# ~]22(J TXX.((..sxx8  !2!23%%II))JJJJ''
 
 !. 9 9"%c40 t$" "
 ~&!+>*a/N3CCHH:1NN/!2 3489C!	:H MM(( ++d* qj	 	 
+5<++- 	6D&&t}}5!%"
4;;-t 45	6 t338S8S-1==? "OOI/0A5--e4  J.!/B*,b!b/Bb*bH .A* _ _D '(G"
'+%%e,		L !' S!5!5!78A=+:+<(
L !s4+?+?+A'B!BC
$aS3t/C/C/E+F%FG":,b;GI  )A)A)C&y1	&L	doo.")  j$$)>)>. '^^-
??NN4??3!"HI$--d3 * 5 5d ;I#--/C%c+<===&&s+%%cmm4 %--d3  #ec$*>*>*@&AA& KD%dC000 $ 7 7 9 K??4;;7"&//C#-c3#77#7/3/C/C/Ec/JK,	KK !388C[,A#BBSH
$[1.ykJ<qI	+:+<(
L.ykJ<qI	+	J-<->*l*9+R8H7IKI'IZ"1"3<!"M?3I??MM$//*)$%
 	
e"s   &Y<c                   |j                         }t        |t              r|r&|r$|dk(  sJ d}n|sd}n|r|dk(  sJ d|d}nd|}| j                  r)| j                  d   j
                  r|j                         s|j                  r|j                  r!|| j                  v r| j                  |   }||fS |j                  |d      }| j                  j                  |      }|rt        |      |fS t        | j                        }	t        |t              rd|	 }nd	|	 }| j                  j!                  |t"        j$                  g 
      }
| j                  j'                  ||
       t)        || d|       }|j                  r+| j*                  j-                  |       || j                  |<   n| j.                  j-                  |       t        |t              rn|| j0                  |<   t2        j4                  D ]L  }|j7                  |      }t9        d |D              r'| j:                  |   }||vsJ d| d| d       |||<   N ||fS |j                  |      }||fS )a  Generate a block pointer or tensor descriptor for Triton kernel operations.

        This method creates either a block pointer (for regular Triton operations) or
        a tensor descriptor (for TMA operations) based on the indexing type. It handles
        caching and reuse of descriptors for performance optimization.

        Args:
            name: The name of the buffer/tensor being accessed
            var: The variable name for the pointer
            indexing: Block pointer options or tensor descriptor options containing
                     indexing information and boundary check settings
            other: Additional parameters string (e.g., padding options)

        Returns:
            A tuple containing:
            - block_descriptor: The generated block pointer or tensor descriptor variable name
            - other: Modified additional parameters string with boundary check options
        , other=0.0r   , boundary_check=z, padding_option='zero'r<  F)rc  r  tma_descriptorr   = c              3     K   | ]A  }t         j                  j                  j                  |t	        j
                  d              C ywr   N)rL   rA  r:  rB  r   r   )r   r   s     r   r  z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>  s=       !' GG,,DD &a(8s   AA	z#duplicate advancement for pointer 'z' at type 'r  )ru  r   r  rJ  rV  r  r  r-  rv  r  r   r  r   nextr|  r  namedvarr   uint64r  rU   rw  r   r  r~  r   r   r  rC  r  )r   r   r   r  r  checkblock_descriptorblock_ptr_line	block_varblock_descriptor_id	named_var	line_bodyr   advance_offsetsadvancementss                  r   codegen_block_ptrzTritonKernel.codegen_block_ptr=  s   2 '')h 78 ------+E94KL+E95 !!  $,,##%  SD,?,?%? $(#6#6s#; h  &&e "*e!D HH,,^<	 y>500&*4+<+<&=#h8)23F2G'H$)78K7L'M$ HH--$ELL . 	 ^Y7(2B1C3~FV/WX	$$MM++I6/?D'',II''	2h8 BFD,,-=> !. = = I*2*B*B4*H   +:	  %'+'@'@'F/|C ABRASS^_c^ddefC :I%56!I&  &&  (s3&&r   c                N   d| d|j                    d}t        t        |j                   |j                              D ]B  \  }\  }}t        j
                  j                  j                  ||      s4d|j                  |<   D |j                  ||j                   |j                  dd      }| dt        t        j
                  j                  |             d}t        |t              rd| d| | dS | d	t        j                  j!                  |j"                         d| dS )
Nr  r  r  FTr  r  r{  	tl.store(z.store()r)  r  r  r&  rL   rA  r:  rB  r(  r  r0  rM  	get_dtyper   r  r   r  r5  )	r   r   r  r  r  r  rs  rX  broadcast_dims	            r   codegen_block_ptr_store_linez)TritonKernel.codegen_block_ptr_store_line  s0    #5'H,@,@+AC *3$$h&>&>?*
 	8%C%#} ww77]K27**3/		8 66      7 
 '/0A0A$0GHIKh0ykE75';;GAHH$9$9(:J:J$K#LBugUVWWr   c                   |s|sy t        |t        j                        sJ | j                  |dd       }t        |t              sJ |j
                  }|j                         r|j                  nd }|rt        | j                  |            nd }| j                  ||rdnd ||      }	| j                  |      }
| j                  j                  |
|	dt        j                         y )NFr  0)
assignmentrI  )r   r   r   r  r   r   r  r  texprrename_indexingindirect_assertget_load_bufferr   r  r   rR  )r   r   r  lowerr   r  r   r  size_strlinebuffers              r   check_boundszTritonKernel.check_bounds  s     $

+++==RV=W(O444&&	(0(9(9(;8$$8=5--d344 ##esx
 %%h/&$5Lr   c                    |j                         s|j                         r| j                  S | j                  r5| j                  d   j
                  r|j                         s| j                  S | j                  S )Nr<  )	r  r  r  rJ  rV  r  r  r  loads)r   r  s     r   r3  zTritonKernel.get_load_buffer  sb      "h&:&:&<<<!!  $,,'') 99::r   c                    d}| xj                   dz  c_         | j                  r| j                  }t               r"| j                   dk(  r|j	                  |       y y y )Nztl.extra.cuda.gdc_wait()rN   )r  rJ  r  r  r   )r   wait_bufferGDC_WAITs      r   _handle_pdl_before_loadz$TritonKernel._handle_pdl_before_load  sV    -A  ))K1$%%h/ %  r   c                     d} j                   r j                  }t               rO j                  t	         fdd| d|       } j
                  j                  ||t        j                         y y )Nz%tl.extra.cuda.gdc_launch_dependents()c                 "     j                   k(  S r   )r  )current_load_indexr   s   r   <lambda>z5TritonKernel._handle_pdl_after_load.<locals>.<lambda>  s    *d.>.>> r   z0; z # gdc launch for rH  )	rJ  rx  r  r  r;   r   r  r   rR  )r   launch_buffer
result_var
GDC_LAUNCHlaunch_if_last_loadrA  s   `    @r   _handle_pdl_after_loadz#TritonKernel._handle_pdl_after_load  sr    <
   22M!%!1!1"0>j\!3J<@# HHm-@T  r   c                P    | j                   j                  t        |||             y r   )saved_partial_accumulater  rd   )r   r   r   r  r  s        r   r  zTritonKernel.partial_accumulate   s$     	%%,,dNC8	
r   c           
        | j                   j                        }| j                  xx   dz  cc<   t        }| j	                  |      |}t
        j                  j                        }| j                  |d| j                  | |dd            }t        |t              r"| j                  |j                        rd| _        |j                         |j!                         }t#        d | j%                  |      j'                         D              }	| j)                  |      rd}
nX|	sd}
nS| j*                  rE| j,                  d   j.                  r,fd	}   d
}
t1        j2                  t4        d|      }nd}
|sr8|j7                         r(| j8                  rdt;        | j8                         }nd}nd}	 d}t<        j>                  j@                  r"| jB                  jE                         }|   dkD  }	 | j)                  |       xr | j*                   xr | xr |	}d}|rd}d}d}tG              re|}|tH        jJ                  tH        jL                  fv r=t<        j>                  jN                  rtH        jP                  }n|dtS        |       dz  }d}nt        |tT        tV        f      r| jY                  |||      \  }}t        |tT              rd| | |
 | d}n/| dt
        jZ                  j]                  |j^                         d}|ja                  ||jb                  |jd                  dd      }|jd                  }ntg        |      rd| d| d}|jh                  }d}n^d| d|jj                   d|jl                   |
 | | d
}|jn                  r|jn                  }ntp        js                  |j                        }|tH        jJ                  tH        jL                  fv r/t<        j>                  jN                  r|dz  }tH        jP                  }|tH        jt                  k(  r/tH        jv                  jx                  |dz  }tH        jt                  }| j{                  |      }| j}                  |       | j~                  j                  | ||      ||      }| j                  ||       |j                  dkD  rxx   dz  cc<   t        |t              sJ |j                  |_D        |rd| d| d}| j~                  j                  ||||jn                        }|j                  r|j                  rd}n|tH        jt                  k(  rd }nd!}| j8                  rt;        | j8                        n|}d"|jl                   d| d| d}| j~                  j                  ||||j                        }| j*                  r|j                         ss| j                  j                  |       |S )#zc
        Load from the memory location 'name', offset by some indexing expression 'index'.
        rN   TFr  rE  r  c              3  &   K   | ]	  }|d k(    ywr  r   )r   r  s     r   r  z$TritonKernel.load.<locals>.<genexpr>/  s      
AF
r  z, eviction_policy='evict_last'r<  c                          kD  rsryy)N
evict_lastevict_firstr   )expected_countr  indirect_indexingload_countsr   s   r   decide_laterz'TritonKernel.load.<locals>.decide_later8  s    t$~5"3'$r   z, eviction_policy='<EP>'z<EP>r   z, other=r  z, cache_modifier='.cg'Nr{  r  r   r  z.load(r(  r  r  rw  ru  r  r  r  r  z0.0Truer.  r  )Jr  r  r  r   r  rL   rA  r*  r  tma_compatibility_checker_clsr   r   r  r   r  r  r  ry   get_strides_of_loadr  is_broadcastedrJ  rV  r  r  r  r<   r  _load_otherr`   r"   r   skip_l1_cacherL  buffer_read_countsrj   r   rq  rr  rp  r   rF   r  r  r&  r   r  r5  r  r0  r)  r   r   r   r  r   r   r   r   r  r  r3  r>  r   r  rG  	use_countr\  r   rW  r   r  rz  r  )r   r   r   r   	make_lineoriginal_indexrI  r  r  is_coalescedeprS  r  has_read_depsrZ  rY  cachemodappend_broadcastr   r6  r  load_bufferrD  zero	other_valrP  r  rQ  rR  s    `                       @@@@r   r  zTritonKernel.load  s    iiood#''DQCK	 55e<!!$'==&*&H&H	 'I ' ! 	
 h0T5N5NNN6
 26D.((*
**,  
 44^DKKM
 
 ~.1B1B""t'7'7';'C'C% % ).N+B!))*:FLQIB:8+<+<+>"=1A1A#B"CD%E	 ==&&!%!A!A!C.t4q8M	 ##N33 )))!! 	 	 /H $#D)D 77==77!MMEd;u#5"6a88DE (_6M$NO*.*@*@#x+' % h8%&6%7wrd8*AND./vahh6K6KHL\L\6]5^^_`D==((((#'# >  !,,&~6!#d>*:"=#+#6#6 !#d8+=+=*>c(BSBSATUWTXY^X_`h_iijk (($11E)99(..IE %--88MM88))

"u}}'8'8'@ &

**84$$[1XX&&4U ' 

 	##K<!#"*&7888'11
%j\4D3EQGD**Th6K6K + J !!** Dejj(!DD7;7G7GM$"2"23T  #8#4#4"5R
|2i[PQR!XX..U*:J:J / 
 $$X-?-?-A*""&&z2r   c           	     p   | j                   j                  |      }|}t        j                  j	                  |      }d}||dk(  r|dk(  }	| j                  | |d|	      }| j                  |d|du |      }
t        |
t              r6| j                  |
j                        r| j                  j                  |       || j                   j                  v }| j                  |      }|r'|r%| j                  j!                  t#        |d             t        |
t$        t&        f      r-| j)                  |||
      \  }}| j+                  ||
|||      }n2||
j,                  }t/        |      rZ|j0                  Nt3        d |j0                  D              s2dj5                  t7        t8        |j0                              }|d	| d
z  }d| d| d| d|
j:                   d
	}n|dk(  rd| _        |
j,                  }t/        |      rZ|j0                  Nt3        d |j0                  D              s2dj5                  t7        t8        |j0                              }|d	| d
z  }d| d| d| d|
j:                   d	}nt?        d|       tA        jB                         }| jD                  s7| jF                  r+|jI                  | jK                  || j                               | j                  j!                  t#        ||             | jD                  s| jL                  jO                  |       |jQ                          y)zn
        store the 'value' to the memory location 'name', offset by some indexing expression 'index'.
        NtmaTrK  )r  r  r  ztl.debug_barrier()c              3  8   K   | ]  }t        |      d k(    ywr   Nr  r  s     r   r  z%TritonKernel.store.<locals>.<genexpr>       ?aCFcM?   r  .broadcast_to(r  r)  r  rw  
atomic_addc              3  8   K   | ]  }t        |      d k(    ywri  r  r  s     r   r  z%TritonKernel.store.<locals>.<genexpr>	  rj  rk  ztl.atomic_add(z, sem='relaxed')zstore mode=))r  r  rL   rA  r*  rU  r  r   r   r  r   r  r  inplace_buffersrW  storesr   rU   r  r  r&  r,  r   r   r   rC  r  r  r   r  atomic_add_foundr  
contextlib	ExitStackrJ  r  enter_contextguard_cooperative_storerz  r  close)r   r   r   r  moder   r]  rI  r  rE  r  
is_inplacerW  r  r  r6  indexing_strvalue_shape
exit_stacks                      r   storezTritonKernel.store  s    iit$!!$'$(!<45=EME(,(J(J	 )K )% ==dl&?	 ! 
 h0T5N5NNN6
 ,,33D9 TYY666
,,^<.KK!!,t5I"JKh2I JK&*&<&<T3&Q#e44h 0%D \ $--L%e,KK+?5;;??"iiC(=>.Q ??se4~Sr(BSBSATTUVD\!$(D!#--L%e,KK+?5;;??"iiC(=>.Q ??#C5\N#eWBxGXGXFYYijD%D6&:;;))+
$$)C)C$$T%A%A$%TUl467$$""&&u-r   c                Z    | j                   j                  d| dt        |       d       y )Nztl.device_assert(r  r  )r  r   repr)r   condmsgs      r   device_assert_asyncz TritonKernel.device_assert_async  s(    !24&49+QGHr   c                    | j                   j                         }|j                  t        |d| d             |j	                         S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  r5  r   rU   indent)r   r   r7  rs  s       r   ru  z$TritonKernel.guard_cooperative_store  sC    
 88NNPd.?uK,PQR}}r   c                t    d }|D ]0  }|t        |d      s||j                  }"||j                  z  }2 |S )Nr   )r   r   )r   	variablesmaskselems       r   _combine_maskszTritonKernel._combine_masks(  sK     	3D|t[)= NNE!DNN2E	3 r   c                :   | j                   j                  t        j                         | j                  j                  |d         }| j                  |d         }	| j                  |d         }
| j                  |d         }|r| j                  j                  |d         nd}|r| j                  |d         nd}|t        j                  k(  rd}n!|t        j                  k(  rd}nt        d      | j                  | j                         | j                  j                  | j                  d	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d||j                        }| j!                  | j                  |       | j#                  |||      }||_        |S )z3
        See [Note: Inductor bucketize op]
        r   rN   r!   r   r   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r  z, )r  )r  r  r/   ONE_ELEMENT_PER_THREADr  r  r  r   rR  r  r  r>  r  r   r  r   rG  r  r   )r   r  
boundariesboundary_indicesindexing_dtyper  sortersorter_indicesboundaries_ptrboundary_sizeboundaries_underlying_numelboundary_stride
sorter_ptrsorter_stridetriton_dtyper\  r  s                    r   	bucketizezTritonKernel.bucketize4  s   $ 	 C CDA7))*Q-8&*&7&7
1&F#++JqM:39TYY__VAY/v
8>))&)4FU[[(%Lu{{*%L%G  	$$T\\2""LL5fXRbr2M1NbQ`Paac nBgRl"]O2  !,, # 
 	##DLL&9##F,<nM r   c                    | j                         }|dk(  rd| dS | j                  }dg||z
  z  dg|z  z   }| ddj                  |       dS 	NrN   z!triton_helpers.promote_to_tensor(r  r  r   r  r  r  r   rI  r  )r   r  ndimsnreducesizess        r   reduction_resizezTritonKernel.reduction_resizem  sj    '')A:6ugQ??)))VHw,>>$))E*+1--r   c                    | j                         }|dk(  rd| d|fS | j                  }dg||z
  z  dg|z  z   }|g |d ||z
   dg|z  nd }| ddj                  |       d|fS r  r  )r   r  r   r  r  r  r  s          r   reduction_resize_and_shapez'TritonKernel.reduction_resize_and_shapev  s    '')A:6ugQ?FF)))VHw,>>=B=N9e'uw(9A3=9TX 	 $))E*+1-y88r   c                   | j                   dk(  r|S | j                         | j                   z
  }| j                         }|d| dgz   }| j                  j	                  |t        t        |      ||      |t        |            S )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rN   Nr  r  )rI  r   r  r   r  r  r   r   )r   r7  r  rI  target_ndimr  target_shapes          r   reduction_collapse_dimsz$TritonKernel.reduction_collapse_dims  s     ""a'L--/$2I2II,,.$\k2hZ?xx  3u:}lC%	 ! 
 	
r   c                   :;<=>?@A dEd}t        j                  |      D cg c]  }|j                   }}t        j                  ||      }t	        d |D              rHt        j                  t
        j                        t        j                  t
        j                         j                  sJ t        d  j                  D              } j                  |       t        |      } j                  r|j                   j                          j                  d   j                  d   }	 j                   rY j#                         }
t%        |
      dk\  sJ |
D cg c]  }d|v sd|v s| }}d	d
j'                  |       d;t)        |      @n) j+                         ;t)         j#                               @ j-                  ; @fd|      }d}dv rt/        |t(              r|\  }} j1                          j2                  z
  <	 	 	 	 	 	 dF< fd=	 	 	 	 	 	 	 	 dG=fd}<> fd}|f}| j4                  j6                  v r j4                  j6                  |   S t9              }t;              }t=         j#                               }d|<<    j4                  j?                  |t)        |            }t        d |D              |_         dj'                  |      ::fdA jB                  rqtD        jF                  jI                        }d ?dH ?Afd}dk(  rnwt/        |t(              r&tK        ||      D cg c]  \  }} |||       }}}nAdk(  r3 j4                  jM                   jN                  ||j                        }n	 |||      }dv rt/        tP              sJ tR        jT                  jW                         }|r"dtY        |       d j[                  |       d}nCtY         j4                  jM                   jN                  d |	 d!| d"||j\                              }d#d$d   > | jN                  |||       ||_        nd%k(  r8 j^                  r ja                  ||A|      }nm jc                  |      }nYd&k(  rMt/        td              sJ |\  }}}t)         fd' jg                   jN                  |||<      D              }ndk(  r ji                  |      }nt/        tP              sJ  = jN                  ||j                        \  }} }! j4                  jM                   jN                  || |!      }n j4                  jk                  d(| |t)         j#                                     }"tD        jF                  jm                        } j-                  tn        |      }t/        |t(              sdk(  r j#                         }
t%        |
      dk\  sJ |
D cg c]  }d|v sd|v s| }}t)        |      |"_.        d	d
j'                  |       d; jp                  js                  |" d); d
| d
| d       n5 jp                  js                  |" d) j+                          d
| d
| d       dv rd(| d*} jt                  jw                         }# jp                  js                  | d) j+                          d
t        jx                  |#      jz                   d
 j[                  |#       d       d#d$d   >|!dtY        |       d j[                  |#       dn|	 d+}$ jN                  j}                  d,|" d-| d.> d/|" d
| d
| d
|$ d0|" d1 A|" d2|"       d3| d1 A| d2|       d3        | j~                  ||"|       n4t              r ja                  ||A|      }ndk(  r3d(| d4}%d(| d5}& jp                  js                  |% d) j+                          d6| d        jp                  js                  |& d7 j+                          d
| d        jN                  j}                  d8|% d-|& d9|% d
|& d
| d
t        j                   d:        jN                  j}                  d8|% d1 A|% d2|%       d8|& d1 A|& d2|&       d8	       |}' j4                  j?                  |'j\                        }( j                   j~                  |'|(|%|&<      }ntE        j                        }) |)|"|      }*dk(  r! jN                  js                  |" d1|*        n' jN                  js                  |" d1 A|*|"              t
        j                  k(  rD j4                  jM                   j~                  |" d;t
        j                  |"j\                        }" | j~                  ||"d        j^                  rtD        jF                  jm                        }t        j                         }+ j~                   j                  fD ]2  },|,js                  d<       |+j                  |,j                                4 dv r j~                  js                  | d= j                  | d>               j                  | d?|      }- jt                  jw                         }# j                  ||#t        jx                  |#      jz                        }. | j                  ||-|.       nFt              rd%k(  sJ |\  }/}0}1 j                  |/t;              |d         }2 j                  |0t;              |d@         }3 j                  |1t;              |dA         }4 j                   j                  |/|0|1|2|3|4<	       ndk(  rw|\  }'}(t/        |td              sJ  j                  |'t;              |d         }5 j                  |(t;              |d@         }6 j                   j                  |'|(|5|6<       n1 j                  |t;              |      }7 | j                  ||7d       |+j                          | j4                  j6                  |<   t/        |t(              rt        dB |D              sJ  j                  j                  |       dCv rt%        |      d@k(  sJ t%        |      |z  }t%        |      t%        |      k(  sJ tK        ||      D ]F  \  }8}9|9J |8j                  |9k7  s j~                  js                  |8 d1|8 dDt        |9       d       H |S t/        |t              sJ  j                  j                  |       |j                  |d   k7  r7|d   J  j~                  js                  | d1| dDt        |d          d       |S c c}w c c}w c c}}w c c}w )IzS
        codegen reduction of value to Triton according the reduction_type
        c                    | j                   t        j                  t        j                  fv r$t	        j
                  | t        j                        S | S r   )rI  r   rq  rr  r  r  r   r  s    r   maybe_upcastz,TritonKernel.reduction.<locals>.maybe_upcast  sF     ;;MMNN UEMM2 r   c              3  `   K   | ]&  }|t         j                  t         j                  fv  ( y wr   )r   rq  rr  r  s     r   r  z)TritonKernel.reduction.<locals>.<genexpr>  s"     MqU]]ENN33Ms   ,.c              3  :   K   | ]  }|j                    d   ywr  Nr   r  s     r   r  z)TritonKernel.reduction.<locals>.<genexpr>       MDdkk]$/M   r<  r   r   XYr  r  r  c                x    j                   j                  j                  d|  d d| j                        S )Nr  r  r  r  )r   r  r  rI  )r  r  r   rz  s    r   rB  z(TritonKernel.reduction.<locals>.<lambda>  s?    dhh''"1#R'7q9gg!	 (  r   N)argminargmaxc                   t        	      }
j                  | |      }	dk(  rYt        
j                               dk(  }|j                  J |r| d}dg|j                  d}n>| d}g |j                  d}n(
j                  | d| d d|j                        \  }}|| d	
j                  |       d}n|j                  }|||fS )
zK
            Helper to generate a reduction call, e.g. tl.sum.
            r  rO  z[None,:,:,None]rN   z
[:,:,None]r  r  r  r{  )r   r  r   r  r   r  r  rI  )r7  r  result_typetriton_reduction_fnis_bmmr\  r   rX  rI  r   r   s          r   final_reductionz/TritonKernel.reduction.<locals>.final_reduction  s    #@"O00FE&T11349{{... %wo6F00a0E %wj1F-ekk-1-E $ ? ?*+1UG2cU!<ekk! &"84(9(9+(F'GqI#kk;--r   c                N     | ||      \  }}}| j                  | d|        y)zU
            Generate a reduction and assign it to an existing variable.
            r  N)r   )r7  rD  r  r  r  r  s        r   final_reduction_definez6TritonKernel.reduction.<locals>.final_reduction_define	  s0     *&%EKE1aMMZLE734r   c                    j                  | |      }j                  | |      }| j                  d| d| d d| d| d d| dj                  | d       d	       y )
N                z_val, z_idx = triton_helpers.z_with_index(r  )
                r  _idx
                )r  r   r  )r7  rD  r  r   rX  rI  root_opr   s       r   final_argreducez/TritonKernel.reduction.<locals>.final_argreduce  s    00FE00FEMMF:,.DWI\Z_Y``bchbiiklokp qC 5 5D6I JK Lr   r   r  c              3  >   K   | ]  }t        |d          r|  ywr  )rB   )r   r   s     r   r  z)TritonKernel.reduction.<locals>.<genexpr>*  s!      *
(;CF(CC*
s   r  c                :    s| S t         j                  | |      S r   )r  r  )tvalfvalr  s     r   r  z*TritonKernel.reduction.<locals>.where_cond/  s     (..tT4@@r   c                d   |j                   s|t        j                  k(  r| S ||k(  s| dk(  r| S | t        j                  |      j                  k(  rt        j                  |      j                  S | t        j                  |      j
                  k(  rt        j                  |      j
                  S | S )z7update reduction constant mask value to match dst_dtyper   )rW  r   r   iinforz   r{   )r  r  r  s      r   update_constant_dtypez5TritonKernel.reduction.<locals>.update_constant_dtype7  s    
 ..)uzz2I#O	)X]#Ou{{95999 ;;y1555Y!7!;!;; ;;y1555#Or   c                     || j                         }j                  t        |      }j                  j	                  j
                   | |      | j                   | j                        S )Nr  )rI  _map_tuple_or_scalarr`   r   r  r  r   )r  r  default_strr   r  r  r  s      r   _mask_valuez+TritonKernel.reduction.<locals>._mask_valueI  sb    /EKKP"77wOxx((LLuk2++++	 )  r   online_softmax_reducer  rH  )r  r  r  r  r  r  zindex, z.shape)rz   r{   welford_reducewelford_combinec              3  v   K   | ]0  \  }}j                   j                  j                  ||        2 yw)r  N)r   r  r  )r   r  r   rI  r   s      r   r  z)TritonKernel.reduction.<locals>.<genexpr>  s9      #$u HH%%dllEe%T#s   69r   = tl.full(_indexr   r  _next, z_next = triton_helpers.z%imum_with_index(
                    #
                )
                r  _nextr  _max_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = _val_bvalrN   r!   c              3  <   K   | ]  }t        |t                y wr   )r   r\  r  s     r   r  z)TritonKernel.reduction.<locals>.<genexpr>d  s     LAz!%67Lr  )r  r  r{  )r  rT   r   rT   )r  rT   r  r  r   z1tuple[str, Optional[torch.dtype], BlockShapeType])rD  rT   r  rT   r  r  r   r   r   rT   )Vpytreetree_leavesrI  tree_mapry   r   r  r   rJ  r   rV  r  r  r@  r  r   r  r  r   r  r   r  r  r   r   rI  r   reduction_cacherU  rS  rR  r  r   rS  r#   r  r  r  r  r  rT   rL   r   r  r   r  r   r  r  welford_reduce_fallbackr   _welford prepare_softmax_twopass_fallbackr  default_accumulatorr`   r  r   rL  select_index_dtyper  rz   r   rx  r@   r"   r  %online_softmax_reduce_final_reductionget_reduction_combine_fnr   rL  rr  rs  ry  rt  r  r  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionrv  rC  rz  rd  rJ  r\  r  )Br   rI  r  r   r  r  r  original_dtypesr  reduction_range_prefixdense_sizesr  xy_sizes_onlylogical_indexr  r  r  acc_typetorch_acc_typeresult_shaperD  r  r  r  dmasked_valueaccumulator_dtypeaccumulator_indexmeanm2weight_result_dtype_shapeaccumulatorr  r  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedr{  bufpeer_valpeer_idxresult_mean	result_m2result_weight	peer_meanpeer_m2peer_weightpeer_maxpeer_sumpeersr   
orig_dtyper  r  rX  r  r  r  rz  r  sB   ````                                                      @@@@@@@@r   	reductionzTritonKernel.reduction  s-   	 170B0B50IJ399JJe4M_MM++Iu}}EI''u}}=E$$$$MD<L<LMM% u??LL)!%!1!1"!5!<!<Q!?   ..0K{#q(((.9XdSD[CSWKTXMX =!9 :!<N.K!002N 4 4 67K )) 
 11%'',$}%%'$*A*AA	.	. /	. ?		. 	.B
	5#
	5 
	5 /	
	5
 
	5	 6	00088++I66"9-))4D0023S((// l(; * 

  * *
 *
  

 zz% 	A
 $$ll00KG$$	 	 !88 E5)>A%>QRdaAq 1RR5(
  $xx00uEKK0X*5':!55!,<<<$%HH$K$K$M! *+C,>+?uTEVEVWhEiDjjk(l%(+)) LL./E.Fgl^[bc"3"."4"4	 * )% &+e<^LLL*l<M $5
 #33--!%!4!4"NE:xQV"J "&!=!=eU!KJ#44!,999%1"r6" #(,dBU)# 
  #:: "BB5%P
!,<<<*9LL,0B0B+' "XX..LL'v / 
 ((++J< $D0023 , K
 ll66~yQG//wGGgu-!U*"&"6"6"8K{+q000)4%!%tsd{%M % ).m(<K%'(=)A(B!%DNII''&-{>2B"WIRPXzYZ[ II''&-{43F3F3H2IG9TVW_V``ab !55&'
|6$:!"mm>>@		##()T5H5H5J4K2{{;/334Bt7H7H7U6VVWY &+e<^L %0 M*+51B1B;1O0PPQR2359 
 ##W%6$77Nwi X M$5#6br) MS{m5,A;!O P Q"#3z5F4Gu2MO`'a&b c  **JEV &n5!00z8U
  #::$%j\"6$%j\"6 		##&'{43F3F3H2IIZ[cZddef 		##&'|D4G4G4I3J"XJVWX ##$%W_,= >()O+<BugRH\H\G] ^ ##$%S6Gu4M)_(` a$%S6Gu4M)_(` a (
!XX__5
@P@P_Q
!GG**##
  88S
$[%8!U*LL**k]#gY+GHLL**&-s:g{+K*LM 

* #'(("3"3..&-|4#jj)//	 #4 #K '**JT %%ll66~yQG#--/J..0D0DE 7./((67
 !55&&00!l(4+@+@J<tAT+U*VW  JJ!l%()W #mm>>@JJU[[-E-I-I   4 4j(HU%n5%)99998B5Y KK$Y/AJ	
 II$Y/AJ
 #MM!$Y/AJ
 33((!
  #::)3&
J!'8444JJ 0 ;WQZ  JJ 0 ;WQZ ::(( GG 0 ;W 't';';ZPTU.8  +j%(LLLLL""))*5 !LL?+q000"%j/O"Cz?c/&::::#&z?#C Z!---99
***44%s3%t,?
,K+LAN"  j*;<<<""&&z2 ?1#55&q)555&&00!l#j\6I/Z[J\6]5^^_` _ K4 Yb  SR%s#   w5&w:4w:w?x(xc                   | j                  |||      }| j                  |||      }t        d      D cg c]'  }t        | j                  j	                  |            ) c}\  }}|j                  d| d| d| d| d| dt        j                   d| d| j                  |        d| d| j                  |        d       ||fS c c}w )Nr!   rH  
            r  9 = triton_helpers.online_softmax_reduce(
                )
            r  )	r  rT  r   r   r  r   r"   r  r  )	r   r7  r  r  rX  rI  r  r  r  s	            r   _online_softmax_reducez#TritonKernel._online_softmax_reduce  s     66vPUV66vPUVMRSTX!V#dhhooEo&B"C!V
JL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%% "Ws   ,Cc           
          fd|||fD        \  }}}d| d| d| d d	}fd}|||fD 	cg c]/  }	 j                   j                   ||	j                              1 }
}	j                  dj	                  |
D cg c]  }t        |       c}       d|        t         fd|
D              S c c}	w c c}w )	z;
        Helper to codegen triton_helpers.welford.
        c              3  D   K   | ]  }j                  |        y wr   )r  )r   r  r7  rI  r   s     r   r  z(TritonKernel._welford.<locals>.<genexpr>  s(      
 ((>
s    ztriton_helpers.welford(r  r  c                2    t        | d | dz   d  z         S )Nr   rN   )r   )r   rX  s    r   reduced_shapez,TritonKernel._welford.<locals>.reduced_shape  s$    qcAgi(8899r   r  r  c              3  V   K   | ]   }j                  ||j                         " y wr   )r  r   )r   r  r   s     r   r  z(TritonKernel._welford.<locals>.<genexpr>  s*      
 ++E5;;?
s   &))r   r  r   r   r  r   r   )r   r7  r  r  r  rX  rI  welfordr  r  welford_resultsr  s   ``   ``     r   r  zTritonKernel._welford  s    
F+
b& ,D6B4r&C5J	:
 F+
 HHOO%}U[[/IOJ
 
 	DII&G!s1v&GHIWIVW 
(
 
 	

 'Hs   4B:B?c                   | j                         | j                  z
  }t        | dt        | j	                               |t        j                               }t        | dt        | j	                               |t        j                               }	t        | dt        | j	                               |t        j                               }
| j                  j                  | d| j                          d| d       | j                  j                  |	 d| j                          d| d       | j                  j                  |
 d| j                          d| d       |dk(  r>|\  }}}| j                  j                  d	| d
|	 d
|
 d| d|	 d|
 d| d| d| d       n8|dk(  sJ | j                  j                  d	| d
|	 d
|
 d| d| d|	 d|
 d       | j                  j                  d| d || d|       d|	 d ||	 d|	       d|
 d ||
 d|
       d       |}| j                  | j                  |dd||	|
||	      S )z%Helper to codegen a welford reduction_meanr   rI  ra  _m2_weightr  r  r  r  r  r  z<_next = triton_helpers.welford_combine(
                    z,
                    r  r  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r  r  r  N)r   rI  r\  r   r  r    unknownr  r   r  r  r   r  rx  )r   rD  r   r  r  r  rI  rX  r  accumulator_m2accumulator_weightr  r  r  r  s                  r   r  zTritonKernel.welford_reduce  s     %%'$*A*AA'l% ,,./&&(	
 +l#,,./&&(	
 /l'",,./&&(	
 			m<(;(;(='>b
!L	
 			l4+>+>+@*AH:QO	
 			!",t/B/B/D.ERzQRS	
 ..$D"fLLW^$4G<N;O P MN#326H5I JF"RD6( + "%5555LLW^$4G<N;O PG2k]"^,<B?Q>R S 	MZ;-u(={KL MC
n-=U+C^ TU V J2D1EU/KM_$`#a b	
 !22""

 
	
r   c
           
        t        | j                  ||||||	            }
|||g}t        t        ||
            D ]E  \  }\  }\  }}|"| j                  j                  |	|      }|||<   |j                  | d|        G t        |      S )z0Helper to codegen call to triton_helpers.welfordr  r  )rR  r  r  r  r   r  r   r   )r   r7  r  r  r  r  r  r  rX  rI  r  result_exprsr  result_exprr  r   s                   r   r  z+TritonKernel.welford_reduce_final_reduction  s     dmmFD"fc5IJ#Y>09#lF:S0T 	6,A,^eU""hhooEoG"-QMM[MUG45		6 \""r   c                   | j                  |||      }| j                  |||      }	|j                  d| d| d| d|	 d| dt        j                   d| d| j	                  |        d| d| j	                  |        d       ||fS )Nr  r  r  r  r  )r  r   r"   r  r  )
r   r7  r  r  r  r  rX  rI  r  r  s
             r   r  z2TritonKernel.online_softmax_reduce_final_reduction	  s     66vxO66vxOL:, ' !O#4Bse2f>R>R=S TLD11ZLBC DLD11ZLBC D		
 :%%r   c                D    | j                   r| j                   d   S t        S )NRSPLIT)r[  r2   r   s    r   
max_rsplitzTritonKernel.max_rsplit  s"    $$X..  r   c                   | j                   d   }| j                         sdnd}||j                  z  | j                         z  }| j                  j                  |      \  }}| j                  j                  d| d| d| j                  |       dt        |       d| d	| d
| dd       | j                  | dddg|t        j                               }	| j                  j                  |	 d| dt        |       d       |	S )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r?  zxindex < xnumelNr  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r  r  Tstrip_peersr   r)  r  z = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  )rK  r  rQ  r*  r  r/  rx  r   r  rF   create_cse_varr    r!  ry  r   r`   )
r   rD  rI  default_valxnumelr  r+  r-  r.  r  s
             r   r  z7TritonKernel.codegen_cooperative_reduction_peer_combine  sS    S!(,(@(@(B %..(4??+<<!GGPPQWX%%GG9C0A0A)0L/MMbcnotcubv w$%J:,VXY]X^ _  	& 	
 ##l&!X&&&(	 $ 
 	&&g[ -eers~e  eA  ACD	
 r   c                   | j                   sJ d| _         t        j                  j                  |      }| j	                  |d| j                  | |dd            }d| _         | j                  j                  |      }t        j                         }| j                  r+|j                  | j                  || j                               t        |t        t         f      rY| j                  j#                  t%        || j'                  |||j)                  |      |d|j+                                            nt        |t,              sJ |j.                  }t1        |      rZ|j2                  Nt5        d |j2                  D              s2dj7                  t9        t:        |j2                              }	|d|	 d	z  }| j                  j#                  t%        |d
| d| d| d|j<                   d		             |j?                          y )NFT)r   rI  r  rE  r  r  c              3  8   K   | ]  }t        |      d k(    ywri  r  r  s     r   r  z/TritonKernel.store_reduction.<locals>.<genexpr>n  rj  rk  r  rl  r  r)  r  rw  ) rJ  rL   rA  r*  r  rU  r  r  rr  rs  r  rt  ru  ry  r   r  r  r   rU   r,  r  ru  r   r   r   r   rC  r  r  r   r  rv  )
r   r   r   r  rI  r  r   r{  ry  rz  s
             r   store_reductionzTritonKernel.store_reduction>  s    $$$$ %!!$'==&*&H&H	 'I ' ! 	
 !%iit$))+
%%$$,,T43G3GH h2I JK  **55  ,+H,C,C,E+HI	 h888#--L%e,KK+?5;;??"iiC(=>.Q ??  **uDc%8CTCTBUUVW 	r   c                N   t               j                  d       t               t        d      D cg c]*  t	        fdt        t        ||            D              , }}dj                  d t        j                  j                  |      D              }j                  d| d       t               dd	d
lm} d	dlm}  |        |        G fddt               }	j#                         5  t%        j&                   |	             5   || }
dj                  d |
D              }
j                  d|
        d d d        d d d        | j(                  j+                  j-                               S c c}w # 1 sw Y   AxY w# 1 sw Y   ExY w)Nz@triton.jitr!   c              3  r   K   | ].  \  }\  }}j                  d  d| ||j                         0 yw)rf  r  r  N)r  r   )r   nr  rI  r   r  s       r   r  z,TritonKernel._lift_helper.<locals>.<genexpr>  s@      %A~u s1#Qqc]%u{{Ks   47r  c              3  2   K   | ]  }t        |        y wr   r  r  s     r   r  z,TritonKernel._lift_helper.<locals>.<genexpr>  s     Rc!fRr  zdef {name}():r  r   rn   )ShapePropagationOpsHandlerc                  4    e Zd Z	 	 	 	 	 	 	 	 d fdZy)+TritonKernel._lift_helper.<locals>.CSEProxyc                    	d| z  	 t        |      |i |} t        |      |i |}j                   t        
|      |i |||      S )Nr  r  )r  r  )r   r   r  re  output_dtypeoutput_shaper   dtype_handlerhelperhelper_name	overridesshape_handlers         r   _defaultz4TritonKernel._lift_helper.<locals>.CSEProxy._default  s     4&z) w!   # " #
 w!   # " #
 ||,GIt,d=f=&&	 $  r   N)r   r   r  ztuple[Any, ...]re  r  r   r	   )r   r   r   rE  )r   r@  rA  rB  rC  rD  s   r   CSEProxyr<    s-    '6@N r   rF  c              3  2   K   | ]  }t        |        y wr   r  )r   r  s     r   r  z,TritonKernel._lift_helper.<locals>.<genexpr>  s     BFBr  return r  )rV   r   rS   rT  r   r  r  r  r}  r~  from_iterabler  rl  ro   !torch._inductor.shape_propagationr:  r,   r  rL   set_ops_handlerro  r  r   )r   r  r  dtypesr  r  	signaturero   r:  rF  outputsr   r@  rA  rB  rC  rD  s       `      @@@@@@r   _lift_helperzTritonKernel._lift_helper|  sy   
  !'e 1X

 	  )23vv3F)G 
 
 IIRioo.K.KD.QRR	=267#%	 *PP2424	 	~ 	0 ]]_ 	2a//
; 	2$iGiiB'BBGwwi01	2 	2
 $$(():k(RRk
`	2 	2 	2 	2s)   /F
F2FFF	FF$c                B     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d       g }g }t        d |D              }t        j                   j                  j                   j                        } j                  |||      } j                          j                  z
  }	t!        ||      D ]U  \  }
} j                  j                   j                  |
 dt#        |       d||
j$                        } j                  j                   j                  d| d	 j'                          d|t         j)                                     }
|j+                  |
       t-        |      } j.                  rǉ j)                         }d
|d<    j                  j1                  ||      }dd	j3                  |       d}|j4                  rdnd} j6                  j9                  | d| d	| d	| d       |j+                  |       X d  fd} |d |       d|	 d	| d|||      } j.                  sd }|D cg c]*  } |d| dt;        |j<                         ||            , }} |t        |      t        |            } |t        |      |      }t!        ||      D cg c]+  \  }} |d| d	| d|j<                  |j$                        - }}}t!        |||      D ]*  \  }}} j                  j9                  | d| d	| d       , n|}|D ]$  }t?        |t@              sJ t        |      |_!        & t        |      S c c}w c c}}w )z:
        Perform an associative scan on 'values'.
        TODOc              3  :   K   | ]  }|j                    d   ywr  r  r  s     r   r  z$TritonKernel.scan.<locals>.<genexpr>  r  r  z(ops.scan not supported inside ops.maskedc              3  2   K   | ]  }t        |        y wr   rH   r   rI  s     r   r  z$TritonKernel.scan.<locals>.<genexpr>       Fe*51Fr  r{  r  r  r  r  r   r<  r  r  zfloat('nan')z-1r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   ywr  Nr   r   r  s     r   r  z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>       <EugQK<r  r  r  s    r   csvzTritonKernel.scan.<locals>.csv      88<V<<<r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )Nr  c              3  T   K   | ]  }j                   j                  |       ! y wr   r   containsr   r  r   s     r   r  z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>        LI488$$Y/L   %(r  r  r   rT  rC  r   r`  r  r  r   r  r   r   r  r   )r6  r  r  rL  r7  r  
cache_keysr  rI  r  result_varsrD  r^  r   s               r   cse_multiplez'TritonKernel.scan.<locals>.cse_multiple  s   FA;@8DaTF"QCr%1DJDLLLAKLIY/LL '*&&&9"UE e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% EL   D"D91D!ztl.associative_scan((rw  c                T    | j                   y t        | j                         }d|d<   |S )Nr   r<  )r   rR  )r   r   s     r   _partial_scan_shapez.TritonKernel.scan.<locals>._partial_scan_shape  s*    99$ OE #E"I Lr   ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)ztl.where(roffset > 0, z = tl.where(roffset > 0, )"rJ  r  r   rV  r  r  r@  r   r  r  r   r  r  rO  r   rI  r  rJ  r   r  r  r  rU  rS  r  r  rW  r  r   rH   rI  r   r\  r   ) r   rL  r   r  r  broadcasted_valuesaccumulatorscse_computecombine_helper_fnrX  r  rI  value_dtyper  reduced_sizer  reduced_size_strr  rj  partial_scan_varsrm  partial_scan_varpartial_reduce_vars	accs_nextfull_scan_vars	full_scanpartial_scanri  acc_nextpartial_reducerD  r^  s    `                              @r   scanzTritonKernel.scan  s    $$$$--5v5-MD<L<LMM% u??N$NN"FvFF''(9(94<<H --j&&I%%'$*A*AA/ 	1LE5((++'1%89;kk	 , K HH%%";-r$2E2E2G1HJD0023	 & E %%e,&u-H,,#335#&R "hhooEoN%&tyy'>&?q#A ,1,C,C.		##"m;/?.@7)2hZWXY ##K09	1<	=	&$ )#C(:$;#<CuBGXFYYZ[	
 ((! ):# % 12B1CCtu-.>.D.DE-.>?# # #5#6>Q8RSI'l(;=NON 03>CT/U ,I| ,YKr,qI&,,&,,K  :=<)<: 5+~ &&"m#<XJbHXXYZ ,K% 	5Jj*;<<<#-e#4J 	5 [!!?#s   /N20Nc                ,     j                   sJ  j                  rJ d       t        d  j                  D              } j	                  |       t        |      } j                  rJ d        j                  sJ d       t        j                   j                  j                   j                        } j                          j                  z
  }t        d |D              }t!        |      t!        |      k(  sJ t#        |      D 	cg c]?  \  }}	 |d|	 d j%                          d||   t         j'                               	      A }
}}	d
  fd} j                  d   j(                  sJ  j+                   j                  d         rdnd}t!        |      dk(  r'd|
d    d|
d    d| d| d| d| d} |||
||      }nt-        d      t/        ||      D ]  \  }}||_        |j2                  |_         t        |      S c c}	}w )NrQ  c              3  :   K   | ]  }|j                    d   ywr  r  r  s     r   r  z$TritonKernel.sort.<locals>.<genexpr>F  r  r  z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc              3  2   K   | ]  }t        |        y wr   rT  rU  s     r   r  z$TritonKernel.sort.<locals>.<genexpr>Q  rV  r  r  r  r  r  c                2    dj                  d | D              S )Nr  c              3  &   K   | ]	  }| d   ywrY  r   rZ  s     r   r  z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>]  r[  r  r\  r]  s    r   r^  zTritonKernel.sort.<locals>.csv\  r_  r   c                N   t        |      }t        |      D cg c]  }|  d| d|  }}t        fd|D              r)|D cg c]  }j                  j	                  |       c}S t        ||      D 	cg c],  \  }}	j                  j                  ||	j                        . }
}}	j                  j                   |
       d|         t        |
|      D ]*  \  }}|r||_
        j                  j                  ||       , t        |
      S c c}w c c}w c c}	}w )Nr  c              3  T   K   | ]  }j                   j                  |       ! y wr   rb  rd  s     r   r  z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>b  re  rf  r  r  rg  )r6  rn  r  rL  r7  r  rh  r  rI  r  ri  rD  r^  r   s               r   rj  z'TritonKernel.sort.<locals>.cse_multiple_  s   &'A;@8DaTF"QCr%1DJDLLLAKLIY/LL %(0B$C E5 e5;;?K  LL""{#$Cv. *-[*)E 4%
I+0J(Y
34 %% ELrk  r<  r   rnumelr!   ztriton_helpers.sort_with_index(r   rN   z	, stable=z, descending=zUnhandled sort)rJ  r  r   rV  r  r  r@  rS  r  r  r   r  r  r   rI  r   r   r  r  r  rM  r  rQ  r  r   ra  )r   rL  r  stable
descendingr  rp  rX  r  r  rn  rj  r  r6  ri  rD  	input_varr^  s   `                @r   r  zTritonKernel.sort=  s@    $$$$--5v5-MD<L<LMM% u??N$NN"(( 	
A	
(  ''(9(94<<H%%'$*A*AAFvFF6{c&k))) &f-
 5 "5'D,?,?,A+B!DQiD0023
 
	=	&$ #00002243C3CB3GHhv;!12DQ2G1HK]^_K`Ja b82cU)F8=AO  't-?OK !122%(f%= 	1!J	#(J  ) 0 0J	1 [!!]
s   AHc                    | j                   sy|j                  | j                          | j                   j                          | j                  j                          y)z
        Generate the output from prologue. This should be
        extracted from the subgraph, which is why this is
        partitioned from codegen_body.
        N)rw  r   clearrv  )r   codes     r   codegen_prologuezTritonKernel.codegen_prologue  s@     }}DMM"!!#r   c                   | j                   s=| j                  s1| j                  s%| j                  s| j                  s| j
                  sy| j                  D cg c]  }|j                  s| }}| j                  r| j                  sJ d       i }t        | j                        D ]  \  }}|j                  }t        j                  j                  |t         j"                        }| j%                  t&        |      }d| }| j(                  j+                  | d| d       | j,                  j/                  |t         j"                  d      ||<    | j(                  j+                  d       | j(                  j+                  d	       | j(                  j1                  d
      5  | j3                         sZ| j                  d   }	|	j4                  dk(  sJ |	j4                  }
| j(                  j+                  |
 d|	j6                   d|
 d       | j(                  j9                  | j                          | j(                  j;                  dg       | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j
                         t        | j                        D ]  \  }}|j<                  }d| }t        j>                  |j                  t         j"                        }tA        |j                        }| j,                  jC                  | j(                  | d| d|jD                  d      }ddl#}|jH                  jJ                  jM                  | d| j(                        5   |||   |      }ddd       | j(                  j+                  | d         	 ddd       tO        tQ        | j                              D ]$  }| j(                  j+                  d| d| d       & n| jR                  rWtQ        |      dkD  rHt        |      D ]  \  }}| j(                  j1                  |      5  |j4                  }| jT                  rdnd}| jT                  rdn| d}| j(                  j+                  d| d| d| d|jW                          d 	       ddd       | j(                  j1                  |d
z         5  | jY                  || j(                         ddd        | j(                  j1                  tQ        |            5  | j[                  | j(                         | j(                  j9                  | j                          | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         ddd       t]        g t        |            D ]o  \  }}| j(                  j1                  |d
z         5  | j^                  |j`                     jc                         D ]  \  }}|tQ        |      d
z
  k  rs||d
z      }| j^                  |j`                     |   }td        jg                  |      }ti        |jj                  |      }tm        ||      D cg c]  \  }}|||z  z
   }}}| j(                  j+                  to        | jp                  |   | d!| dtr        jt                  jw                  |       d"              	 ddd       | j,                  jy                  | jz                         |j}                          r n| j(                  j9                  | j                          | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | j(                  j9                  | j                         | jT                  rb| j                  s| j
                  rJ| j~                   d#}| j(                  j9                  d$| d%d&'       | j                  j                          | j                  s%| j(                  j9                  | j
                         | j                   j                          | j                  j                          | j                  j                          | j                  j                          | j                  j                          | j
                  j                          yc c}w # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   hxY w# 1 sw Y   xY wc c}}w # 1 sw Y   xY w)(a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nz1Mix order reduction requires persistent reductionaccumz = tl.full([R0_BLOCK], z, tl.float32)[None, :])r   R0_BLOCKr  z/split_size = min(RSPLIT_SIZE, xnumel - xoffset)z@for _ in tl.range(0, split_size, XBLOCK, num_stages=NUM_STAGES):rN   )r   r   r?  mask =  < rT  zxindex += XBLOCKr  z, 0))r  r  r  z&tl.store(ws_ptr + (tl.program_id(0) + z3 * tl.num_programs(0)) * r0_numel + r0_index, accumz
, r0_mask)rsplit_startr.  
rsplit_endzfor zoffset in range(r  zBLOCK):z = tl.advance(r  z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(r  Tr,  )Cindexing_coder:  rp  r  rx  ry  rV  r  mix_order_reductionrS  r  rI  r   r#   r  r  r   r  r  r`   r  r   r   r  r  r  r   r   r   
writelinesr  r  r   r  rI  unittestmockpatchobjectrT  r   rJ  r  r   r  r  rS  r  r   r|  r   r   r   rT  r  rU   r~  rL   r   r  
invalidaterz  cache_clearr  r  r2  r  )r   r   
loop_treesaccumname2varrs  partial_accumr   r  r   entryr?  r   r   triton_reduction_functionnewvalr  r  levelr   
loop_startloop_endr  advancement	prev_treeprev_advancement
prev_blockprev_num_itercurprevsem_ptrs                                 r   codegen_bodyzTritonKernel.codegen_body  s    zz{{||%%##'+'7'7Ht4<<dH
H##,, C, M&/0M0M&N 
"]!.!=!=,,::>5;;W33M7Kse}		##f3G9<RS '+hh&7&73D '8 'd#
 II QRIIR !!!+ )?//1 ,,Q/E <<3...AII''1#WUZZLA3e(LM		  !3!34		$$*
 		  ,		  .		  -		  !5!56 +4D4Q4Q*R ?&C'--C"3%=D!#!<!<%44ekk"J 1N%441- "XX..		45Qse4@!ii+	 / F $!,,33D)TYYO ",)$/"#
 II''4&G9(=>-?')?V S!>!>?@ 		##<SEAtuxty  zD  E
 ""s:':(4 JtYY%%U%3 ![[F373M3MSVJ(,(B(B6(RWHX  II''vh&6zl"XJbQWQ]Q]Q_P``gh YY%%UQY%7 J88tyyIJ JJ !!Z!9 ...tyy9		  !3!34		  ,		  .		  -.  ((@)J*?(@A #tYY%%UQY%7 262K2K		3eg.	; !3z?Q#66(2519(=I/3/H/H )0'0), *7)E)Ei)PJ,3IOOZ,PM 25[BR1S+$-C !$d]&: :+K +
 		++( $ 8 8 C#,+^I;bI^I^_jIkHllm n!4 ##D$:$:;  "9#< IIT//0IITZZ(IIT\\*IIT[[)		//0%%""d&:&:--..ABGII33:) <    66BBD''IIT112  "

$$&""$g Ir I)? )?f J J. .,+ sw   f#*f#&Hf5f(*f5A&g g,B0g Bg/=g)Ag/(f2-f55f?g	g	g&)g//g9	c                   g }| j                         rQg }| j                  d|g        |D ]6  }t        |t              r|j	                  t        |             /t        |t              rit        j                  j                  j                  |j                  | j                  t        j                        }|j	                  t        |             t        |t        j                         r`t        j                  j                  j                  || j                  t        j                        }|j	                  t        |             "t#        dt%        |              |S )Nr   r  fallbackz!Unsupported numel argument type: )r  add_numel_to_call_argsr   r  r  r   rl   rL   rA  r:  	size_hint
inner_exprr  r"   unbacked_symint_fallbackr   r   rB  r  )r   r  
numel_argsrf  hints        r   kernel_benchmark_extra_argsz(TritonKernel.kernel_benchmark_extra_args9  s   !+-J''J;! Vc3'KKC)_577++55&*&8&8!'!@!@ 6 D
 KKD	*UZZ077++55&*&8&8!'!@!@ 6 D
 KKD	*$'Hc%TUU%V& r   c                   t               }| j                  j                         \  }}}}|j                  g d       |j	                         5  t        j                         }g }t        ||      D ]c  \  }	}
dt        |       }t        j                  j                  |	      }|rt        j                  j                  j                  |j                         | j                  t         j"                        }t        j                  j                  j                  |j%                         | j                  t         j"                        }|j'                  | d| d| d|j)                          d|j+                          d
       n;|	t        j                  j,                  v rt        j                  j,                  |	   }t        j                  j                  j                  |j/                         | j                  t         j"                        }t        j                  j                  j                  |j1                         | j                  t         j"                        }|j'                  | d| d| d|j2                   d|j4                   d
       n(t7        |
t8              rut        j                  j                  j;                  |
j<                  | j                  t         j"                        }d	|
j>                  v rd
}|j'                  | d|        nt7        |
t@              rt        j                  jC                         }t        j                  j                  j;                  |
j                  | j                        }|j'                  | d| d| d|
j4                   d       ntE        d|	       |jG                  |       f |jI                  | jK                                |j'                  ddjM                  |       d       ddd       |j                  g d       t        j                  jC                         }|jN                  }|j	                         5  |j'                  dt        j                  jP                  jS                  |       d       |j	                         5  |j'                  t        j                  jP                  jU                  |             d| }|j'                  | d| d       |j'                  tW        tX        jZ                         d| d       ddd       ddd       |j                  g d       |j	                         5  |j'                  dt        j                  jP                  jS                  |       d       |j	                         5  |j'                  t        j                  jP                  jU                  |             |j'                  dtW        tX        jZ                         d       ddd       ddd       |j                  g d       |j	                         5  |j'                  d       |j'                  d       |j'                  d       |j'                  dt        j                  jC                         j\                   d       |j'                  d |        |j'                  d!       |j'                  d"       ddd       |S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   |S xY w)#a  
        Generates Python code for benchmarking this Triton kernel.
        - Creates example inputs (random tensors, constants, sizes).
        - Runs the kernel on the current GPU/stream.
        - Prints runtime (ms) and throughput (GB/s) using `num_gb`.
        Args:
            num_gb (float): The number of gigabytes to use for throughput calculation.
        Returns:
            IndentedBuffer: A buffer containing the generated Python benchmark code.
        )r   r   zdef get_args():arg_r  z = rand_strided(r  z
, device='z	', dtype=r  r  r   r  )r  z = torch.zeros(z*Don't find the buffer or const tensor for rH  r  N)
r  zdef call(args):zwith r  streamz = get_raw_stream(z.run(*args, stream=)r  r  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r  r  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z6ms = benchmarker.benchmark(lambda: call(args), device=z	, rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))/rV   r  python_argdefsr  r  r}  r{  r  r  rL   rA  try_get_bufferr:  
size_hintsget_sizer  r"   r  
get_strider   
get_devicer*  	constantsr  rZ  devicerI  r   r\   r  r   r   r^   rI  KeyErrorr  extendr  r  r   
device_opsdevice_guard
set_devicer   rA   KERNEL_NAMEr  )r   num_gbr\  _argdefs	call_argsrM  r  name_cnt	var_namesarg_namearg_sigvar_namer  r  rZ  const_tensorsymval_hintr  r{  r  r   stream_names                         r   codegen_kernel_benchmarkz%TritonKernel.codegen_kernel_benchmarkS  s`     !,0II,D,D,F))Y56]]_ ?	@ (HI%(I%> :+!'!$x.!12gg,,X677++66&*&8&8!'!@!@ 7 D
 WW--88(&*&8&8!'!@!@ 9 F
 $$#*$4TF"VHJs~~O_N``ijmjwjwjyizz{| !2!22#$77#4#4X#>L77++66$))+&*&8&8!'!@!@ 7 D
 WW--88$++-&*&8&8!'!@!@ 9 F
 $$#*$4TF"VHJ|ObObNcclmymm  mA  AB  C  1"#''"2"2"<"<&*&8&8!'!@!@ #= #K %4&'$$z[M%BC6WW@@BFGG,,66T5G5G 7 E $$#*OE7*VHIV]VcVcUddef #DXJO    *u:+v T==?@wtyy';&<A>??	@B 	9:<<>$$]]_ 
	uQWW%7%7%D%DU%K$LANO   GG&&11%8 !'ug.  K=0B5'!KL  ;22344G}TUV
	 	JK]]_ 	uQWW%7%7%D%DU%K$LANO   GG&&11%8   c+"9"9:;;XY		 	DE]]_ 	N R 01HIlIlInIsIsHtt}~ y12=>N	  [?	@ ?	@L 
	 
	  	 		  sf   O\/A\ <B\?\ 2A\:?A%\-$\:B%]\\	\  \*-\7	2\::]]c                    t        j                  dj                  t        j                  j
                  j                  d                  S )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        get_raw_stream)textwrapdedentr  rL   rA  r  import_get_raw_stream_asr   s    r   imports_for_benchmark_kernelz)TritonKernel.imports_for_benchmark_kernel  s:     F177%%>>?OPQ
 	
r   c                    | j                   ry| j                  ry| j                  r| j                  sJ y| j                  ryy)Nr[  r  rS  r  	pointwise)r[  r  rS  rJ  r   s    r   _get_heuristiczTritonKernel._get_heuristic  sD    !''*&&(((()""r   c                 L   t         j                  j                  j                         t        j
                  t        j                  t        j                  j                  t        j                  t        j                  t        j                  t        j                  t        j                  t        j                  j                  t        j                  j                  t        j                  j                   t        j"                  t        j$                  j&                  d} t        j(                  rt        j*                         | d<   t         j,                  j.                  d| d<   t	        j0                         rd| d<   t        j2                  rLt        j2                  | d<   t        j4                  | d<   t        j6                  | d<   t        j8                  | d	<   t        j:                  r9t        j:                  | d
<   t        j<                  | d<   t        j>                  | d<   | S )N)backend_hashassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinr  r  $are_deterministic_algorithms_enabledTis_hipr  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions) r   r%   _tritontriton_hash_with_backendr"   r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  *write_are_deterministic_algorithms_enabledr  r  r  r  r  r  r  r  r  r  r  )inductor_metas    r   inductor_meta_commonz!TritonKernel.inductor_meta_common  s    "KK//HHJ(.(G(G$*$?$?"(--"B"B%+%A%A$*$?$?$*$?$?"//&,&C&C%+]]%H%H%}}<<!==44#11.4.A.A.`.`
" <<::< @A ==(&*M(#)-M+&##171I1IM-.7=7U7UM348>8W8WM45FF KL ++00 56 77 <= >> CD r   c                  -. t               }i }| j                  j                         D ]  \  }}t        |      r| j                  st
        j                  j                  j                  |      }t        |t        t        j                  f      sd}nt        t        |            }|||<    ||j                  t                      t
        j                  j!                         j"                  }|dk(  r|j                  d       n|j                  d       t$        j&                  r|j                  | j)                                | j*                  j-                         \  -}	.}	t/        .      D ]  \  }
}t        |t0              st3        t        j4                  |j6                        }|t
        j                  j                  j8                  v sbt1        |j:                  t
        j                  j                  j8                  |         .|
<    t=               }| j>                  D ]  }|| j*                  j@                  v r(|jC                  | j*                  j@                  |          || j*                  jD                  v rj|t
        j                  jF                  vrN|| jF                  vr@|jC                  t3        tH        | j*                  jD                  |         jJ                         || j*                  jL                  v s| j*                  jL                  |   }t        |tN              rJ |jC                  |        tQ        -.      D ]O  \  }}t        |tR              s|jT                  tV        jX                  k(  s5|jC                  |j:                         Q t[        |      }| j]                         D ]Z  }t1        |j^                   d|j`                        }.jc                  |       -jc                  te        |j:                               \ -.fd}| jf                  D ]K  }|jh                  r| jj                  r|jl                  ) ||j^                  jo                          d       M | jp                  r |d	       | jr                  r |d
        |d       tu        .| jv                  -      }|ty        jz                  t
        j                  j!                               i t|        j~                  j$                  j                  j                  xr0 dt        | j                        v xs dt        | j                        v d}t
        j                  j                  xs t
        j                  j                  }| j                         j                  t        | j                        t        t        j                        ||| j                  | j                  | j                  | j                  | j                  d
| j                         }| jr                  r| j                  |d
<   t$        j                  st$        j                  j                  r| j                  xs | j                  |d<   t
        j                  j                  ji                         xr | jj                   }| j                  }t        | j                        dk(  }|r|r| j                  j                  | j                        }|j                  j                  j                  d   }|j                  }|#d|v rd|v r|d   t        |d   d      z  }|dk\  }n-ddldme}  | j                  j                         | j                  k(  }|j                  j                  j                  }!|j                  j                  j                  }"t
        j                  j                  j                  |!t$        j                        t        t
        j                  j                  j                  |"t$        j                        d      z  }#|#dk\  r|rt
        j                  j                  j                  | j                  j                  d      rHt
        j                  j                  j                  | j                  j                  d      r
|dk  rd|d<   | j                  r| j                  |d<   | j                  r| j                  |d <   | jp                  r| jj                  |d!<   d}$t$        j&                  st$        j                  r| j                         d"z  }$|$|$|d#<   t$        j&                  r| j                         }%|%|%|d$<   t        .      g|d%<   t               rd|d&<   t        .      D ]  }&d|d'   .|&   j:                  <    t$        j                   |d(<   || _w        | j                  | j                         | j                          | j                  D ]$  }'|j                  d)       |j                  |'       & | j                  r2d*| j                          d+| j                  j$                  d,|d-|d.	}(n| j                  r;| j                  j                         })d*| j                          d/|d0|) d,|d-|d.}(nYd)}*t        |      dk(  rt        t        .            d1k(  rd2}*nd3}*d*| j                          d/|d4|* d5|d-|d6| j                   d.}(|j                  |(       |j                  d7|xs t        t        j                          d8d4j                  d9 -D               d:       |j                         5  | j                  |       | j*                  j	                         D ]  \  }+},|j                  |+ d;|,         |j                  | j                         ddd       t$        j&                  r!|j                  | j                  |$             |j                         S # 1 sw Y   KxY w)<z
        Convert the TritonKernel from Inductor SIMD IR to triton code, including inductor triton heuristics, imports,
        metadata, and benchmarking infra.
        i    Ncpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()rT  c                    t               rj                  t        |              j                  t        | d             y )NT)is_constexpr)rG   r  rR   rP   )r  argdefsrM  s    r   add_constexpr_argz6TritonKernel.codegen_kernel.<locals>.add_constexpr_arg{  s/    -/  h!78NN78$?@r   r   r)  RSPLIT_SIZE
NUM_STAGES)
size_dtyper   ztl.dot)rM  r  r  native_matmul)
	grid_typer  kernel_namemutated_arg_namesoptimize_memrG  rq  num_load	num_storenum_reduction"has_loadstore_with_contiguous_rdimr!   r   r?  r0_rN   g       @)ReductionHint)r  g?i   i   
   Tadd_persistent_rblocktiling_scoresr\  rS  g    eAkernel_num_gbkernel_flopconfigs
launch_pdlr  enable_fp_fusionr   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=rO  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r  zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c              3  <   K   | ]  }|j                           y wr   )	full_namer  s     r   r  z.TritonKernel.codegen_kernel.<locals>.<genexpr>F  s     CcVWAKKMCcr  r9  r  )rV   rK  r|  rB   rJ  rL   rA  r:  rR  r   r  r   r   r4   r   r   rI  r  r"   benchmark_kernelr  r  r  r  r\   r
   r   r   inv_precomputed_replacementsr   r   	mutationsinput_buffersr  ro  removed_buffersrW   
inner_nameoutput_buffersr[   r  r^   	zero_moder_   ZERO_ON_CALLr  r   r   rT  r  rP   rV  rM  rS  r   r   r  r  rk   r  r0   r]  r   r  r   r  r   r  r  is_inferenceis_backward_get_grid_typer   setr  rA   DESCRIPTIVE_NAMErG  rq  r
  r  r  r  rsplit_sizer  r  r  r  r  r   rL  r  r   r  memory_stats
persistentmemoryrX  count_per_threadrz   torch._inductor.runtime.hintsr  get_reduction_hintINNERloopedbytesr  r  statically_known_leqreduction_numelstatically_known_gtr\  r  estimate_kernel_num_bytesestimate_flopsrg   r  rh   emulate_precision_castsr  r  r  ro  r   r[  r  ri   r  r  r  r  codegen_static_numelsaliasesr  r   )/r   r   r  r  r   rT  
numel_hintr  device_typer  r  rf  r  mutated_argsmutationmutation_argargnamer   sizeargr  triton_meta_signaturer  r	  r  
looped_redr  	two_d_redr)  	dim_statsmem_ops_per_threadr_coalesce_ratiocontiguous_redr  
looped_mempersistent_memsaved_bytes_ratior  flopsarg_numrA  heuristics_linereduction_hint	tile_hintoldnewr   rM  s/                                                @@r   codegen_kernelzTritonKernel.codegen_kernel  s
    
![[..0 	+MFE"6*43H3H))77>Jj3*>? !	+C
O<	!*Jv)	+, <KK134''==?DDKe#@A@A&&D==?@#'99#;#;#= Iq	* 	FAs#w' ellCHH5QWW--JJJ#*!''"2"2"O"OPV"W$IaL	 )3 	/H499222  !8!8!BCDII555AGG$;$;;D$8$88  )B)B8)LMXX 499333#yy77A%lJ???  .	/6  3 	/LGS3-MM%6%C%CC  .	/ l+++- 	2DU3TZZ@GW%NN77<<01	2	A $$ 	=D  T%>%>&!2!2!4 5U;<	= %%h'##m,l+ 1$"2"2G!
 /&--agg.Q.Q.ST&&--;; RTYY/P8s4<<?P3P'
 ww++Bqww/B/B ,,.77!$"5"56{;;<!-( $ 5 5!//
 '')
 ##+/+;+;M-(6#6#6#U#U22 766 >? XX&&335Wd>W>W:W
**$)	)==55dkkBL$//66::1=I!*!;!; )=(]* $1#7#mC>PRS:T#T !1S!8G MM446-:M:MM  &,,3399J)44;;AAN ! 0 0 : :V%D%D !; !  **"V-L-L +  	! "S("GG$$99MM115 GG$$88MM114 '",9=56-1-?-?M/*##373K3KM/0%%484M4MM01""f&>&>335;F!17o.""'')E /4m,"+I"6!7I(,K% +95 	BG@AK$Yw%7%<%<=	B.4.L.L*L&'&dii(++ 	 FNN2KK	  #$$($7$7$9#: ; --447 8!!, 0##0"3 4O ""!]]==?N#$$($7$7$9#: ;  *~ .$$2#3 4!!, 0##0"3 4	O I:!#/	:;q@ <I =I#$$($7$7$9#: ;  *~R	{ ;!!, 0##0"3 4))-)A)A(B C	O 	O$473{6678$))Cc[bCc:c9ddfg	
 [[] 	#&&t, II--/ 1S#c#/01KK		"		# ""KK55f=>}}	# 	#s   A'q  q	c                   t         j                  j                  j                  |       } t	        | t
        j                  t        f      rt        |       }t        |      }|S d}t         j                  j                  j                  | |      sC|dkD  rt        d|        |dz  }t         j                  j                  j                  | |      sC|S )Nr!   i @  z!Failed to find static RBLOCK for )rL   rA  r:  simplifyr   r   r   r  r4   r2  rB  )r  r  s     r   rU  z#TritonKernel._get_persistent_RBLOCKS  s    !!**62fu}}c23f+C!#&C 
 Cgg&&;;FCH?$'H%QRRq gg&&;;FCH
 Jr   c                N    	 t         j                  |        y# t        $ r Y yw xY w)NTF)rD  rU  rB  )r  s    r   has_persistent_RBLOCKz"TritonKernel.has_persistent_RBLOCKd  s*    	//7 		s    	$$c                   d
d}| j                   D ]e  }|j                  r| j                  rdt        j                  j
                  j                  |j                        } ||      r)|j                  |j                   dt        |              |j                  r| j                  r| j                  r1| j                  | j                  |j                              }d| d}n3| j                  |j                        }| j                   rt#        |d      }|j                  |j                  j%                          d|        |j                  dk(  sG| j&                  sU|j                  d       h y	)a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        c                B    t        | t        j                  t        f      S r   )r   r   r   r  r   s    r   is_static_integerz=TritonKernel.codegen_static_numels.<locals>.is_static_integer~  s    dU]]C$899r   znumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)rO  zBLOCK: tl.constexpr = r?  zXBLOCK: tl.constexpr = 1N)r   r   r   r   )rV  rM  rJ  rL   rA  r:  rT  rT  r   r   r  rS  r  rp  r1  rU  r  rz   r   rG  )r   r  rY  r   simplified_tree_numelrT  r  s          r   r8  z"TritonKernel.codegen_static_numelsl  s,   $	: $$ 	;D$$(=(=()(8(8(A(A$**(M%$%:;NNdkk](3?T;U:V#WX  T%>%>-- JJt';';DJJ'GHEFugMfgC55djjAC,,!#rl$++"3"3"5!66LSERS{{c!dmm9:'	;r   c                   t        | j                  D cg c]  }t        |j                          c}      }| j                  r|dk(  sJ t
        j                  S | j                  r|dk(  sJ t
        j                  S |dk(  rt
        j                  S |dk(  rIt        t        | j                  | j                              rt
        j                  S t
        j                  S |dk(  rt
        j                  S t!        d|       c c}w )NrN   r!   r   z"Unsupported number of dimensions: )r  rV  r  rM  r  r-   MixOrderReductionGridr  CooperativeReductionGridGrid1Dry   r  rm  Grid2DWithYZOverflowGrid2DGrid3DrB  )r   r   r7  s      r   r%  zTritonKernel._get_grid_type  s    8H8HI***+IJ##6M6$:::''6M6$===!V$+++!V3t22D4D4DEF(===$+++!V$+++=aSABB Js   Dc                   | j                   D ]  }t        |j                  t        j                  t        j
                  f      r|j                  }n*t        j                  j                  j                  ||      }|j                  r| j                  s|j                  |       |j                  t        |              y r   )rV  r   rT  r   r   r   rL   rA  wrapper_codegenerate_numel_exprrM  rJ  r  r  )r   r   r  	arg_typesr   r   s         r   r  z#TritonKernel.add_numel_to_call_args  s    $$ 	-D$**u}}ell&CDzzww++??dK$$(=(=  &  d,	-r   c                z   t         j                  j                  }|j                          | j                  j                         \  }}}}| j                  |||       | j                  j                  D ]  }|j                  |        |j                  ||d|| j                         |r| j                          y y )NT)r   re  r  )rL   rA  rc  write_triton_header_oncer  r  r  workspace_argsgenerate_workspace_allocationgenerate_kernel_callr  deallocate_workspaces)	r   r   r  deallocate_wswrapperr  r  re  wss	            r   call_kernelzTritonKernel.call_kernel  s     ''&&((*%)YY%=%=%?"9a##D)Y?))** 	6B11"5	6 	$$(( 	% 	
 &&( r   c                   t         j                  j                  }| j                  j	                         \  }}}}t        ||      D ]w  \  }}t        |t              st         j                  j                  r|j                  d| d| d       Jd| d}|j                  |       d| d}|j                  |       y y )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rL   rA  rc  r  r  r  r   r]   cpp_wrapperr   )r   rm  r  r  arg_signaturesrf  arg_signaturer6  s           r   codegen_nan_checkzTritonKernel.codegen_nan_check  s    ''&&*.))*B*B*D'9na"%i"@ 
	,C-377&&%%TUXTYY\]`\aade )-BCD%%d+(-BCD%%d+
	,r   c                    t        |i |S r   )r\  )r   r  re  s      r   r/  zTritonKernel.create_cse_var  s     $1&11r   c                F   |j                    d| j                  | j                  |j                               }|j                  j
                  s| j                  r+|j                  dk(  r| j                  j                  |       y | j                  j                  |       y )Nr  r?  )r   rp  r1  r   rootr  r  r   r  r   r  )r   r  r6  s      r   codegen_iteration_ranges_entryz+TritonKernel.codegen_iteration_ranges_entry  sy    **SD,@,@,L!M NO ::$":":u||s?R((. II%r   c                   |j                   J | j                  |j                         }| j                  }|dk7  rd| dnd}| j                  r| j                  r|j
                  r| d}d|j                  j                          d| | S )Nr  r{  r  r   z + rsplit_startztl.arange(0, zBLOCK))r   indexing_size_strr  r  rS  rM  r   r   )r   r  r  r  ru  s        r   r  z)TritonKernel.iteration_ranges_ranges_code  s    +++%%e&6&67&&*5*C4}A&&&))""x/Fu||1134F4&IIr   c                ^    | j                   }| j                         }dg|z  }d| d| d| dS )NrN   r  r  r  )r  r   )r   r  r  r  r   r  s         r   iteration_ranges_scalar_codez)TritonKernel.iteration_ranges_scalar_code  sC     &&&&(sTz$r%;-q99r   c                $   |j                   J d|j                    d}| j                  |      r#d| d|j                   dz    d|j                    d}|j                  j                  ||      }| j                  dk7  r| d	| j                   dS |S )
Nztl.program_id(r  r  z + tl.program_id(rN   z) * tl.num_programs(r  r  r{  )r  rm  	pid_cacher`  r  )r   r  r  pids       r   iteration_ranges_get_pidz%TritonKernel.iteration_ranges_get_pid  s    ~~)))u~~.a0 &&u- cU+ENNQ,>+??STYTbTbSccefCoo!!#s+z)U$t//022
r   c                    |j                   dk(  xr[ |j                   xrL | j                   xr= t        j                  j
                  j                  |j                  t                      S r1  )	r  has_zdimr  rL   rA  r:  r2  rT  r3   )r   r  s     r   rm  z#TritonKernel.needs_yz_grid_overflow  sa    NNa YNN"Y...Y GG$$99%++~GWXX		
r   c                    | j                   r | j                   |j                          d   S t        |j                            S )Nr   )r[  r   r1   )r   r   s     r   rZ  zTritonKernel.max_block  s;    $$'7u%=>>//r   c                   | j                   r5t        j                  j                  j	                  |j
                  d      ry| j                  sy| j                  rW|j                  j                          d| j                  v r.| j                  |j                  j                          d   dk(  r6yt        j                  j                  j                  |j
                  d      ry|j                  r(| j                  r| j                  |j
                        }n9|j                  dk(  r| j                  rd}n| j                  |j                        }|j                  r| j                   r|| j#                         z  }t        j                  j                  j%                  |j
                  |      r[|j&                  dk7  xsJ |j(                  xs< t        j                  j                  j+                  |j
                  t-                     S y)NrO  Fr   rN   Tr?  )r  rL   rA  r:  statically_known_ltrT  rs  r[  r   r   rB  rM  rS  rU  rG  rZ  r  r*  ro  r  r  r2  r3   )r   r   rZ  s      r   r  zTritonKernel._has_constant_mask  s      ww33DJJC!!DKK$5$5$7#8!>$BSBS!S  DKK$5$5$7#8!>?1Dww77

AF !:!:33DJJ?I[[CDMMIt{{3I!;!;!DOO$55I 7788YO" W==W77##88^EUV r   c                d    | j                   d   }|j                  dk(  sJ | j                  |      S )Nr   r?  )rV  r   r  )r   xtrees     r   r  z TritonKernel._has_constant_xmaskI  s5      #||s"""&&u--r   c                    | j                   D ]2  }| j                  |      s|j                  |j                   d       4 |j                  d       y )Nr  r   )rV  r  r  r   )r   r   r   s      r   r  zTritonKernel.filter_masksN  sN    $$ 	8D&&t,!!T[[M"67	8
 	&!r   c                    t        t        j                        d | j                   D cg c]  }t        |    c}S c c}w r   )rR  r   r   rI  r   )r   r   s     r   get_reduction_prefixesz#TritonKernel.get_reduction_prefixesV  sB     ]::;<Ud>U>UV
 t
 	
 
s   <c                   | j                   D cg c]  }|j                  s| }}dj                  t        d |D                    }|j	                  d| j                  |              | j                   D cg c]+  }|j                  rt        j                  |j                     - }}t        |      }|j	                  d| j                  |              yc c}w c c}w )z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        rA  c              3  :   K   | ]  }|j                    d   yw)rT  Nr  r  s     r   r  z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>c  s     "UTdkk]%#8"Ur  z	rnumel = zRBLOCK: tl.constexpr = N)
rV  rM  r  r  r   rp  r   r   r   rD   )r   r7  r   reduction_treesr  	rn_blocksr  s          r   r  z%TritonKernel.codegen_reduction_numels]  s    
 -1,<,<RD@Q@Q4RRF"U_"UUV	$**V"4!567
 ((
   %%dii0
	 

 y)/

60B/CDE S

s   CC90Cc                |    | j                         }|D cg c]  }t        j                  | | fi | c}S c c}w )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        )r  r   r   )r   ru  re  rn_prefixesr   s        r   r  z#TritonKernel._get_reduction_symbolso  s=     113JUVxx0;F;VVVs   !9c                    | j                         }| j                  ddd      }t        t        |      dz
        D cg c]  }t	        ||dz   d        c}t        j                  d      gz   S c c}w )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        rT  Tr   rN   N)r  r  rT  r   rD   r   r   )r   r  	rn_numelsrs  s       r   _get_reduction_index_coeffsz(TritonKernel._get_reduction_index_coeffsv  s{     113//PT/U	;@[AQTUAU;V
47M)C!GI./
]]1 	 
s   A0c                :    | j                         }t        ||      S )zK
        Compute linear reduction indices from N dimensional ones.
        )r  rC   )r   
multi_indscoeffss      r   r  z'TritonKernel._flatten_reduction_indices  s     113,,r   c                $   | j                  ddd      }| j                  ddd      }| j                  |      }|j                  d| j                  |              | j                  |      }|j                  d| j                  |              y)zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r  r  r   r  )r   r7  
rn_offsetsrn_indsrc  rindexs         r   r  z&TritonKernel.codegen_reduction_indices  s    
 00d 1 

 --gtQU-V 11*=
4#4#4W#=">?@009	$"3"3F";!<=>r   c                   |j                   }|j                  r%|j                  |j                   d| d| d       n|j                  D|j                  |j                   d| j                  |              |j                  | d       n|j                  | d| j                  |       }n| j                  || d      }| j                  s|j                          dnd}|j                  | d| j                  |       d	| |j                   d| g       | j                  |      r!|j                  | j                  |             y |d
k(  r| j                  s%|j                  | d|j                   d| d       y y )Nr  z	offset + r  z
offset = 0r   r   r  z	offset = rA  r?  r  r  rT  )r   r  r   r   r  r  r   r|  r  r   r  r  r  create_constant_mask)r   r  r  r?  r6  
block_sizes         r   r  z,TritonKernel.iteration_ranges_codegen_header  s    LL==NNejj\QCy4@A^^#NNejj\T-N-Nu-U,VWXNNaS
+,+Id&G&G&N%OP881#VM ,0+C+C1779+U#  OOc4#@#@#G"HJ<Xzzl#dV, ""5)NN444U;<s(t77NNaS

|3qc?@  8r   )r   TNN)r  zdict[str, sympy.Expr]r[  zOptional[FixedTritonConfig]r  zOptional[int]r   r   r  rI  rh  r   r   r  )r   r   r  z Optional[Union[str, tuple[str]]]r  z!Optional[TMACompatibilityChecker])r   )r   r   r   r   r  z/Union[BlockPtrOptions, TensorDescriptorOptions]r   ztuple[str, str])r   r   r  r   r4  r   r   r   )r   r   r  r  )r   r   r   r   r   )
r   r   r   r   r  rT   rw  rK   r   r   r  )r  Optional[CSEVariable]NN)r  rT   r  z.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]r  rT   r  rh  r  r   r  z Optional[tuple[str, sympy.Expr]]r  r  r   rT   )r   ztuple[str, BlockShapeType])r  rT   rI  rh  r   rT   )
rI  rh  r  rh  r   rJ   r  +Union[CSEVariable, tuple[CSEVariable, ...]]r   r  )rI  rh  r  )r   r   r   r   r  rT   )r  tuple[CSEVariable, ...]rL  tuple[torch.dtype, ...]r   r   )rL  r  r   zUCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  r  r   r  )
rL  r  r  r  r  r   r  r   r   r  )r  rV   )r   r  )r  zOptional[float]r   rV   )r   z type[triton_heuristics.GridExpr]r  )r   r   r  zOptional[IRNode]rl  r   )r   r\  )r  rb   )r  rc   r   r   )r  rc   r  r	   r   r   )r  rc   r   r   )r   r   r   r  )r   rc   r   r   )r   r   r   r   )r7  rV   r   r   )ru  r   r   zlist[sympy.Symbol]r  )r  r@  r   r   )r  rc   r  rV   r   r   )Zr   r   r   r   r  rC  r   r0  rp  r  rC  rU  rq  r`  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r,  r8  r3  r>  rG  r  r  r|  r  ru  r  r  r  r  r  r  r  r  r  r  r  r*  r  r4  rO  r~  r  r  r  r  r  r  r  r  rR  rU  rV  r8  r%  r  ro  rt  r/  rx  r  r|  r  rm  rZ  r  r  r  r:   r  r  r  r  r  r  r  ri  rj  s   @r   rD  rD  	  s    &I%%).E&.O$;!JN7N
 48'+09%09
 209 %09 
09d : :6 
 

"

#J*:0


 " " 8<GKD
D
 5	D
 $ED
V f'f' f' B	f' 
f'PX:MM M 	M
 M40
U

:H
xv SWYY *Y3>YFOY	YvI
& 480477 C7 &	7
 $7 7 17 .7 
7r.
9
(
1<
	
(hh h &	h
 ;h 
5hT&DO&"
0F
P#0&!
	B<< < 	<|>S1>S;R>S	>S@"'"
" (" 
!"BD"'D" (D" 	D"
 D" 
!D"L$g%R4||

 , ,\||	     (;TC$
- OS))/)GK),,2	&J:(:14:	:
0
/b.
" 
 
F$W 
 
-? A(A0>A	Ar   rD  c            
      b    e Zd ZU eZded<    eej                  ej                  ej                  ej                  ej                  ej                  ej                  ej                  g      Zd fdZedd       ZddZd ZdddZ	 d	 	 	 ddZ	 	 	 	 	 	 	 	 dd	Z	 	 	 	 	 	 	 	 dd
Zd Z xZS )TritonSchedulingz	type[Any]kernel_typec                    t         |   |       |t        |d      sy |j                  D ]$  }t	        |t
        t        f      st        |_        & y )Nr  )	r_  r`  r   r  r   r8   r6   debug_triton_codedebug_device_str)r   	schedulerr  rb  s      r   r`  zTritonScheduling.__init__  sM    #GIw$?OO 	:D$0B CD(9%	:r   c                    t         j                  j                  st         j                  j                  r't	        g | j
                  t        j                        S | j
                  S r   )r"   r   cooperative_reductionsforce_cooperative_reductionsr   backend_featuresrQ   REDUCE_TO_SINGLE_ELEMENT)r   r  s     r   get_backend_featuresz%TritonScheduling.get_backend_features  sR     MM00}}99P#&&P(O(OP  ###r   c                  
 t         j                  j                  }t        ||      \  }}|r|j	                  |       t
        j                  rtddlm}m	
 t        
fd|D              sX|D cg c]  }t        ||      r|j                           }}|j	                  |j                   ddj                  |              |rt        ||      }	|j!                  ||	       y y c c}w )Nr   )r5   ForeachKernelSchedulerNodec              3  6   K   | ]  }t        |        y wr   )r   )r   r7  r  s     r   r  z3TritonScheduling.codegen_comment.<locals>.<genexpr>  s      >?
189s   z Fused node name list: r  )rL   rA  rc  r?   make_commentr"   debug_fusiontorch._inductor.schedulerr5   r  ry   r   get_namecommentr  r+   write_provenance_debug_handle)r   node_scheduler  rm  origins_detailed_originsr5   r7  
node_namesdebug_handler  s             @r   codegen_commentz TritonScheduling.codegen_comment  s    ''&&%8%P""  )
  CP  +!!%67 JJL
 
 $$''>tyy?T>UV BL 11+|L s   .#C$c                L   t         j                  j                  }||j                  v r|j                  |   }|S t        j
                  j                  r$t        |t        j
                  j                        nd}t        |      d d }dj                  d|||j                         g      }t        j                  j                  rt        j                  j                   d| }||j                  |<   t        j
                  j                  r|nd}|j                  t        t         j"                        |      }|j                  t        t         j$                        |      }|j                  dd      }t'        t)        |j+                               d      \  }	}
}t-               }t.        j1                         rt.        j                  ||       |j3                  d	|d
       |j5                  |d       t         j                  j7                         }|j3                  d|j8                   d       d| }t;        ||      \  }}|d|z   dz   |z   z  }|j=                  ||j?                         |       tA        jB                  d      rtA        jD                  |||       |S )Nr   r   r  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr,  z''', device_str='z')z# kernel path: r  kernel_metadata)#rL   rA  rc  src_to_kernelr"   r   descriptive_namesr>   rM   r  next_kernel_suffixaot_inductormodel_name_for_generated_filesunique_kernel_namesrX  r   rA   r'  r  r(   r'   r-  rV   async_compileuse_process_poolr   r   rI  r  r?   define_kernelr   r$   is_metric_table_enabledlog_kernel_metadata)r   src_coder  r   rm  r  
fused_namekernel_category	subs_name	_basenamer  kernel_pathcompile_wrapperr  metadata_commentr  detailed_originss                    r   r  zTritonScheduling.define_kernel  sg   ''&&w,,,!//9Kt m ==22 &mV]]5T5TU 
 AJ2ANO((?J8R8R8TUK ""AA "(!4!4!S!S TTUVaUbc /:G!!(+'-}}'H'HiI
  ''K,H,H(I;WH''K,C,C(DiPH  ''s;H(08>>;K1Ld(S%Iq+,.O--/ $$Y9%%(=i]%&PQ""84"8WW@@BN%%(9.:M:M9Nb&QR!0>(;M7(S%G%w 58H HH!!_5579I ../@A++KhOr   c                    | j                  |d      }t        j                  |      }| j                  ||t	        d |D                    S )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r  c              3  <   K   | ]  }|j                           y wr   r  r   r7  s     r   r  z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>D  s     :WA1::<:Wr  )r  )generate_kernel_code_from_nodesr)   r  benchmark_codegened_moduler   )r   r  n_spills_thresholdr  r  s        r   benchmark_fused_nodesz&TritonScheduling.benchmark_fused_nodes<  sV    
 77PT7Ux(..#
:WQV:W0W / 
 	
r   c           	     8  
 t        t        j                  j                        }t	               5  |j                  t        j                  j                               5  dfdfd}fd}||nt        dg      }t        j                  d|j                          |        j                  fcddd       cddd       S j                         
j                  j                  	   j                  
 d          j(                  }t+        |      d
k(  sJ |d   j,                  |kD  rt'        d	      nyt        j                  j                         }	t/        j0                  
fd|	      t+        j2                        dkD  r't/        j0                  
fdt5        |	            z
  t        j                  d|        |        j                  fcddd       cddd       S # t        $ rn}t         j"                  j$                  r t        j                  d||       t'        d	       |        j                  fcY d}~cddd       cddd       S d}~ww xY w# 1 sw Y   nxY wddd       y# 1 sw Y   yxY w)z$Benchmark an already compiled moduleNc                 ~     j                   J t        j                  j                   j                         d   dz   S Nr   z.kernel_perf__file__ospathsplitextr  s   r   cache_file_pathzDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_pathR  s6    ||///ww''5a8>IIr   c                 >            } t        | t                     y r   r*   r   )r  r  mss    r   store_cachez@TritonScheduling.benchmark_codegened_module.<locals>.store_cacheV  s    &(T3r7+r   c                             } t         j                  j                  |       r.t        |       5 }t	        |j                               cd d d        S y # 1 sw Y   y xY wr   )r  r  existsopenr  readr  fdr  s     r   
load_cachez?TritonScheduling.benchmark_codegened_module.<locals>.load_cacheZ  sM    &(77>>$'d 0r$RWWY/0 00s   AA r!  %kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrN   c                 4      j                     d         S rg  
clone_argsr  callwrapped_jit_functions   r   rB  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>       D!@!5!@!@$!G!JK r   r  c                 "     j                     S r   r  r  r  s   r   rB  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>  s     ? 4 ? ? F r   z+The fused kernel for %s took %.3f ms to run)r   rL   rA  r;  r   r  rI  r   rk  rL  r  get_argsr  r  r  	Exceptionr"   r   .disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr   n_spillsr.   	benchmarkr  r   )r   r  r  r  device_interfacer  r  r  r  r  r  r  r  r  r  s    `        @@@@@r   r  z+TritonScheduling.benchmark_codegened_moduleG  sf    4AGG4G4GH R	$##AGG$G$G$IJR	$ BJ, )4
*i[:Q  II7
 B~3<<'?R	$ R	$ R	$B <<>D88D#&;; (4)44d;A>? -66Iy>Q&&& |$$'995\<<> !**K! +==>Bk33F"6{ B
 II=
 Ms||#eR	$ R	$ R	$N  
(==OO		@
 5\3<<''cR	$ R	$ R	$N
(OR	$ R	$ R	$ R	$ R	$sh   .J"AI;<	J(I;8HCI;.	J	I8
AI3I8I; 	J3I88I;;J	 JJc                   |j                  d      }|xr  t        d |j                         D              }| j                  }|rddlm} |}|rd|d<   |j                  d      r
d|d	<   d|d<   t        j                  |j                        s|j                  d	      rJ d|d	<   t        j                  j                  ||||      } ||i |}| j                  |||      S )
Nr~  c              3  <   K   | ]  }|j                           y wr   )is_split_scan)r   r  s     r   r  z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>  s      (
%)D (
r  rN   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)contains_opry   scheduler_nodesr  triton_split_scanr  rD  rV  r3  r`  rL   r  triton_kernel_kwargsadd_multi_kernel_choices)	r   kernel_featureskernel_argskernel_kwargsis_scanr  r  r  r   s	            r   create_kernel_choicesz&TritonScheduling.create_kernel_choices  s     "--f5 
C (
-<-L-L-N(
 %
 +/*:*:@/K>CM:; &&v.=AM9:>CM:;11/2Q2QR$(()HIII=BM9:		66+}
 k;];,,V[-PPr   c           	        |g}t         j                  j                  s|S |j                  xr |j	                  d       }|j
                  xr |j	                  d       }|r%|j                   | j                  |i |ddi       |r|j                  j                  }t        j                  j                  j                  |d      r[|j                   | j                  |i |ddix}       |r2|j                  r&|j                   | j                  |i |ddd       t        |      dkD  r.|dd  D ]  }	|j                  |	_         |j!                  d        |S )	Nr  r  Fi   )r  r  rN   c                    | j                   S r   )rS  )r  s    r   rB  z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>  s    q'='= r   r  )r"   r   multi_kernelrS  r`  r  r  r  rL  r3  rL   rA  r:  r2  r   must_keep_buffersr  )
r   r   r  r  kernelsoptional_persistentoptional_cooperativer  r  kernel2s
             r   r  z)TritonScheduling.add_multi_kernel_choices  s    (.h}}))N$99 
-BSBS+C
 ?
  &;;  
MDUDU,E
 A
 NN    # 38  __44Fww44VUC-T--$' 8= E '5+E+ENN((((+ <A:?	 w<!"12; E,2,D,D)E LL=L>r   c                   fdfd}fd}dg }}d}t         j                  j                  }t        |      t         j                  _        t         j                  j                  }t        |      t         j                  _        t
        j                  dkD  }	t
        j                  dkD  }
| j                  |d|	|
d      }|D ]   \  }}}|D cg c]  }|j                          }}|D cg c]  }|D ]  }|j                           }}}|j                  t        t        j                        d      }t        j                   |      t"        j%                  d	|j&                          |       \  &|z  }|z  }|j)                  j&                         ܉j+                         j,                  j.                    j0                   d          j2                  }t5        |      d
k(  sJ |d   j6                  dkD  rt9        d      xnUt         j                  j;                         }t=        j>                  fd|      t=        j>                  fd|      t"        j%                  dt        d |D                      |        |z  }|z  }|j)                  j&                          |t         j                  _        |t         j                  _        |||fS c c}w c c}}w )Nc                 ~     j                   J t        j                  j                   j                         d   dz   S r  r  r  s   r   r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_path  s6    <<+++77##CLL1!4~EEr   c                             } t         j                  j                  |       rCt        |       5 }t	        d |j                         j                         D              cd d d        S y# 1 sw Y   yxY w)Nc              3  2   K   | ]  }t        |        y wr   )r  )r   r  s     r   r  zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>  s      Eaq Er  r  )r  r  r  r  r   r  splitr  s     r   r  z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cache  s^    "$Dww~~d#$Z F2  E2779??3D EEF FFs   .A,,A5c                 \            } t        | t              dz   t              z          y )Nr  r  )r  r  r  ms_clones    r   r  z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cache
  s&    "$Ds2w}s8}<=r   r   g        T)subkernel_nodescustom_part_algorithmenable_autotunemixed_sizesonly_gen_src_coder  r  rN   r  c                 4      j                     d         S rg  r  r  s   r   rB  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>B  r  r   r  c                 (     j                     d   S rg  r  r  s   r   rB  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>F  s    ;0;;TB1E r   zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc              3  <   K   | ]  }|j                           y wr   r  r  s     r   r  z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>L  s     <A1::<<r  ) rL   rA  r  r   inplaced_to_remover"   combo_kernels_autotunecombo_kernel_allow_mixed_sizesgenerate_combo_kernel_code	get_nodesr  rX  r   rA   r  r)   r  rk  rL  r  r  r  r  r  r  r  r   r  r  rI  r.   r  )r   	node_listr  r  total_ms	file_listtotal_clone_msremoved_buffers_originplaced_to_remove_origr*  r+  kernel_code_listr  r  
node_groupr  fused_node_listsr  r7  namesr  r  r  r  r  r  r  r'  r  s                         @@@@@@@r   benchmark_combo_kernelz'TritonScheduling.benchmark_combo_kernel  s   
	F	 	>  ) # ww66",-A"B"#''"<"<%/0G%H" 77!;;;a?::%"&+#" ; 
 (8 5	+#Ha=GHT 0HH/?OeOAQZZ\O\OEO''K,C,C(DiPH""8,CII7
 &<LB~B(*  .<<>D88D#&;;  0%00$7:;,66Iy>Q&&&|$$q( %e,X<<> !**K! '00E!
 IIV<<<	 MNHh&NS\\*k5	+l #7%<"22o  IOs   L 7L)r  zOptional[Scheduler]r   r   )r  ztorch.devicer   )   )r   tuple[float, str])r@  N)r  zOptional[OrderedSet[str]]r   rA  )r  rs   r  	list[Any]r  r  r   list[TritonKernel])r   rD  r  rB  r  r  r   rC  )r   r   r   rD  r  r   r   rQ   FOREACH	BUCKETIZEINPLACE_BUFFERSMASKED_SCATTER_WITH_INDEXSCANSORTTRITON_TEMPLATESTUPLE_REDUCTIONr  r`  r   r  r  r  r  r  r  r  r?  ri  rj  s   @r   r  r    s   )K)!""$$**44++**		
: $ $MB=~	
 RVW$5NW$	W$r#Q+#Q #Q &	#Q
 
#QJ33 3 &	3
 
3j\3r   r  c                   g }| j                         }|t        |t        j                        sJ |r0|j                  $|j                  | j                          d       |S ddlm} | j                         }|J | j                  j                  |      }t        |t        |f      sJ dt        |              t        j                  j!                  |      5  t"        j$                  }|j'                  | j)                               j+                         }|t"        _        d d d        |j                  | j                          d       |j                  t-        j.                  d             |S # 1 sw Y   RxY w)Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )get_template_noder   r#   MultiTemplateBuffermake_kernel_renderr  r  0torch._inductor.codegen.cuda_combined_schedulingrM  r  r  get_backendrf   r  rL   rA  set_current_devicer$   generated_kernel_countr  r4  r-  r  r  )r  linesmulti_templaterM  r  backendold_generated_kernel_counttriton_codes           r   r  r  Y  sd   E++-N!Z@V@V%WWW.;;C((JKL2 L/	
 "!!!..,,V4'N4J#KL 	
klpqxlykz{	
L WW''/ 	H *1)G)G&!AA eg  .HG*	H 	(67X__[&9:L	H 	Hs   A	E66E?)r   r  r  )r  r   r  r%  r  r%  r   r   r  )rI  rh  r   rh  )rI  rh  r   r   )r   zUnion[CSEVariable, Any]r   r   )r   ro   r  )r   r   r   zCallable[[_T], _T])r  r5   r   r  )
__future__r   r"  rr  r  r  r}  loggingr  r   r  r  abcr   collections.abcr   r   r   r   typingr	   r
   r   r   r   r   r   sympy.printing.precedencer   r   torch._loggingtorch.utils._pytreer%   _pytreer  torch._dynamo.device_interfacer   torch._dynamo.utilsr   r   torch._prims_commonr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._tritonr   r   r   utils._sympy.symbolr   r   r   r   utils._sympy.value_rangesr    r   r"   r#   r$   r  r&   	codecacher'   r(   r)   r*   rL  r+   ops_handlerr,   runtimer-   runtime.benchmarkingr.   runtime.hintsr/   r0   r1   r2   runtime.runtime_utilsr3   r4   r  r5   r6   r7   r8   shape_propagationr9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   virtualizedrI   r  rJ   rK   rL   wrapper_benchmarkrM   block_analysisrO   commonrP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   simdr`   ra   rb   rc   rd   re   rf   triton_utilsrg   rh   ri   rj   rk   rm  rl   typesrm   rl  ro   %torch.fx.experimental.symbolic_shapesrp   rq   rr   simd_kernel_featuresrs   rt   	getLoggerr   rk  _logginggetArtifactLoggerperf_hint_logschedule_log
fusion_logr   r   r   r   r   r   r  r   r  r  r  r  r  r  r	  r0  rJ  rM  rS  rU  rX  rZ  r\  rm  r  r  _initialize_pointwise_overridesr  r  r   r  r7  r   r   r>  rC  rD  r  r  r   r   r   <module>r     s   "         	   8 8  E E  0   $ $ C < 0 / K K  Y X 4 ) ) ( F F ; ( ' .  D W W 5    " C B B /    $    %  L>&8	Bg!00<H~~//*E^^--hA
,	6 6  4 $ 4 *Q, Q,h 
 
 
B ~ ~ ~B
 ?4 ? ?8 =, = =@++/+<P++6rQM rQj 	3
&8
;P 8(.bt
&k t
&n  / / 9G"O G"T$+ $+N N
 N
 N
b! !H # # #
%uS%S/-A'BBC 
 j j jZU4A:/0 U4Aph\3~ \3~r   