
    qi~                      d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: erd dl;m<Z< ddl0m=Z= ddl>m?Z? ddl@mAZA ddlBmCZC ddlDmEZEmFZFmGZG ddl1mHZHmIZImJZJ ddlKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZU ddlVmWZWmXZXmYZY ddlZm[Z[ dd l\m]Z]m^Z^m_Z_m`Z` dd!lambZbmcZc dd"ldmeZemfZfmgZgmhZhmiZi erd d#l;mjZjmkZkmlZl d d$lmmZm  ej                  eo      Zpej                  j                  eod%      Zsej                  j                  eod&      Ztej                  j                  eod'      Zu e`       j                  Zw e"g d(      Zxd?d@d)Zyej                   G d* d+             Z{ G d, d-e{      Z| G d. d/e{      Z}dAd0Z~ ed1e]e]2      Zej                   G d3 d4             Z G d5 d6e_e   ee         Z G d7 d8eI      Z ej                  d9:       G d; d<             Z G d= d>e      Zy)B    )annotationsN)Counter)AnyGenericOptionalTYPE_CHECKINGUnion)TypeVar)metrics)MultiTemplateBuffer)analyze_memory_coalescing)free_unbacked_symbols)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hashPyCodeCache)	MemoryDepStarDepWeakDep)CallableIRNode)!indexing_dtype_strength_reduction)CoordescTuner)DeviceProperties)
green_textlast_power_of_2yellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)
cache_property_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reductionsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernelSizeHintMultiKernel)DisableReductionEnableReductionNodeScheduleEntryNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequence)CoalesceVarAnalysis
perf_hintsschedulefusion)zyxr0_r1_c                j    t         j                  j                  j                  j                  }||S | S N)torch	_inductorr   triton	max_tiles)defaultr[   s     b/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/simd.pyget_max_tilesr^   ^   s-    &&--77I!-9:7:    c                       e Zd ZdZej
                  j                  ej
                  j                  d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZee	dd              Z
d	dZee	d
d              Z xZS )IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthc                   t         
|           || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y rW   )super__init__namevar_list
var_rangesnumelprefixrb   rc   kernelroot)selfrg   rh   ri   rj   rk   rl   rb   rc   rm   	__class__s             r]   rf   zIterationRanges.__init__s   sO     		 $
	r_   c                ,    t        | j                        S rW   )r5   rk   rn   s    r]   is_reductionzIterationRanges.is_reduction   s     #4;;//r_   c                ,    t        | j                        S rW   )r6   rg   rq   s    r]   symbolzIterationRanges.symbol   s    !$)),,r_   c                z    t        j                         D ci c]  \  }}||
 }}}|| j                     S c c}}w rW   )r   itemsrk   )rn   symtrk   prefix_to_symts       r]   rw   zIterationRanges.symt   s>     <F;K;K;MN<4&$,NNdkk** Os   7)rg   strrh   list[sympy.Symbol]ri   dict[sympy.Symbol, sympy.Expr]rj   
sympy.Exprrk   ry   rl   
SIMDKernelrm   IterationRangesRootreturnNoner   boolr   zsympy.Symbol)r   r   )__name__
__module____qualname____doc__sympySOnerf   propertyr0   rr   rt   rw   __classcell__ro   s   @r]   ra   ra   c   s    . ww{{ % 3	
    " 
0 0  0- +  +r_   ra   c                       e Zd ZdZ	 d
	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZddZddZ	 	 	 	 ddZ	ddZ
	 	 	 	 dd	Z xZS )r~   z
    Root of a iteration range tree that represents a single
    tiled dimension in the output kernel. It contains multiple
    sets of iteration represented with IterationRangesEntry.
    c          	         |i }t         |   |g i ||||        || _        i | _        || _        |r| j
                  r|	J || _        || _        |	| _        |
| _	        y )N)rg   rh   ri   rj   rk   rl   rm   )
re   rf   indexnodes	pid_cacherr   is_loop
tensor_dimgrid_dimhas_zdim)rn   rg   rj   rk   r   rl   r   r   r   r   r   ro   s              r]   rf   zIterationRangesRoot.__init__   s     I 	 	
 
=?
 *3
 t00X5EFF$  r_   c                <    d| j                   d| j                   dS )NzIterationRangesRoot(, z, ...))rg   rj   rq   s    r]   __repr__zIterationRangesRoot.__repr__   s    %dii]"TZZLGGr_   c                b    | j                   j                         D ]  }|j                           y rW   )r   valuescache_clear)rn   nodes     r]   r   zIterationRangesRoot.cache_clear   s*    JJ%%' 	D	r_   c                2    t        | j                   d      S )Nr   )r6   rk   rq   s    r]   	index_symzIterationRangesRoot.index_sym   s    !T[[M"788r_   c                   t         j                  j                  j                  ||z  | j                        rt        | j                         |      }nt        | j                         ||      }|| j                  vrt        | j                   t        t         j                  j                         ||||       }|t         j                  j                  |j                         <   | j                   j#                  |j                                || j$                  |j                         <   || j                  |<   | j                  |   S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r<   graphsizevarsstatically_known_equalsrj   r   r   r   r   IterationRangesEntryrk   nextrl   iter_vars_countrange_tree_nodesrt   rh   appendri   )rn   rb   rc   exprr   s        r]   lookupzIterationRangesRoot.lookup   s     7733Gf4DdjjQDNN,g6D"4>>#3WfEDtzz!';;-QXX%=%= >?@D 8<AHH%%dkkm4MM  /-3DOODKKM*#DJJtzz$r_   c                    t         j                  j                  }g }t        |      D ](  }|j	                  | j                  ||             ||z  }* g t        |      S rW   )r   r   r   reversedr   r   )rn   lengthsrb   itervarsrc   s        r]   construct_entriesz%IterationRangesRoot.construct_entries   s]     ''++w' 	'FOODKK89&G	' %(#$$r_   c                f    | j                  |      D cg c]  }|j                          c}S c c}w rW   )r   rt   )rn   r   es      r]   	constructzIterationRangesRoot.construct   s'    $($:$:7$CDq
DDDs   .c           
     \  	
 dd|j                   D cg c]+  }t        j                  j                  j	                  |      - }}|D cg c]!  }|s|j
                  | j
                  k(  s |# }}|j                  fd       t        j                  j                  g 	g 
	
fd}|D ]v  }t        j                  j                  j                  |j                        s8 || j                  t        |j                                     |j                   ||       x t        j                  j                  j                  | j                         s, || j                  t        | j                                      g t#        	      g t#        
      fS c c}w c c}w )z,Figure out vars from this tree used in indexc                   t         j                  j                  j                  | j                  t
        j                        }t         j                  j                  j                  | j                  t
        j                        dk(  }|| fS )a:  
            Gets the key for sorting nodes. When two nodes have the
            same divisor, the node with length as 1 should be handled
            first so the current divisor is not changed after multiplied
            node.length. Returns `not length_is_one_hint` for ascending
            sort.
            fallbackr=   )r<   r   r   	size_hintrb   r   unbacked_symint_fallbackrc   )rS   divisor_hintlength_is_one_hints      r]   get_sort_keyz8IterationRangesRoot.vars_and_sizes.<locals>.get_sort_key   s     77++55		F$C$C 6 L   **HHv'F'F +    !&8"899r_   c                     |       S rW    )rS   r   s    r]   <lambda>z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>  s    a r_   keyc                    j                  | j                                j                  | j                         | j                  z  y rW   )r   rt   rc   )r   rb   
index_varssizess    r]   addz/IterationRangesRoot.vars_and_sizes.<locals>.add  s5    dkkm,LL%+Gr_   )rS   r   r   ztuple[int, bool])free_symbolsr<   rl   r   getrk   sortr   r   r   r   r   r   rb   r   r   rj   r   )rn   r   sr   nr   r   rb   r   r   r   s          @@@@r]   vars_and_sizesz"IterationRangesRoot.vars_and_sizes   sK   
	:& <A;M;MNa**..q1NN!CqQ188t{{+BCC

0
1''++
	,  	D77##;;DLL'RDKK$,,)HIJ,,I	 ww77

GLGXdjj'%BCD&*%&(:(5/(:::/ OCs   0F$F)F)/F)rW   )rg   ry   rj   r|   rk   ry   r   intrl   r}   r   Optional[dict[str, str]]r   r   r   Optional[int]r   r   r   r   r   r   r   ry   r   r   r   )rb   r|   rc   r|   r   r   )r   list[sympy.Expr]r   zlist[IterationRangesEntry])r   r   r   rz   )r   r|   r   z+tuple[list[sympy.Symbol], list[sympy.Expr]])r   r   r   r   rf   r   r   r   r   r   r   r   r   r   s   @r]   r~   r~      s     /3)!)! )! 	)!
 )! )! ,)! )! ")!  )! )! 
)!VH9 .%'%	#%E/;/;	4/;r_   r~   c                  p     e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
dZddZddZd
dZddZddZ	ddZ
 xZS )r   c                $   t         |   ||j                  |z  |j                  |j                  |j
                  |||j                  |j                  	       || _         t        j                  d       | j                        | _        || _        y )N)	rg   rj   rh   ri   rk   rb   rc   rl   rm   )re   rf   rj   rh   ri   rk   rl   rm   parent	functools	lru_cache_codegencodegenr   )rn   rg   rb   rc   r   r   ro   s         r]   rf   zIterationRangesEntry.__init__-  s~     	,,'__((==== 	 
	
 0y**40?	r_   c                    d| j                    d| j                   d| j                   d| j                   d| j                   dS )NzIterationRangesEntry(r   ))rg   rb   rc   r   ri   rq   s    r]   r   zIterationRangesEntry.__repr__D  sH    &tyykDLL>DKK=PRSWS\S\R]]_`d`o`o_ppqrrr_   c                L    fd| _         d | j                   _        | _        y )Nc                      S rW   r   )rg   s   r]   r   z/IterationRangesEntry.set_name.<locals>.<lambda>H  s    t r_   c                      y rW   r   r   r_   r]   r   z/IterationRangesEntry.set_name.<locals>.<lambda>I      r_   )r   r   rg   )rn   rg   s    `r]   set_namezIterationRangesEntry.set_nameG  s    ##/ 	r_   c                8    | j                   j                          y rW   )r   r   rq   s    r]   r   z IterationRangesEntry.cache_clearL  s      "r_   c                X    t         j                  j                  |        | j                  S rW   )r<   rl   codegen_iteration_ranges_entryrg   rq   s    r]   r   zIterationRangesEntry._codegenO  s    	//5yyr_   c                   g }t        | j                  t        j                        r|S t        | j                  t        t
        f      sJ t        | j                               | j                  j                  dd  D ]l  }t        |t        j                  t        j                  f      r.|j                  }t        |      dkD  sIt        d |D              s\|j                  |       n |S )Nr=   r   c              3  P   K   | ]  }t        |t        j                           y wrW   )r   r   SIZE.0r   s     r]   	<genexpr>z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>\  s       ,56N1dii0,   $&)
isinstancer   r   Symbolr   r   typeargsIntegerr   lenallr   )rn   precomputed_argsargsymbolss       r]   r   z%IterationRangesEntry.precomputed_argsS  s    -/dii.##$))h%@AR4		?RA99>>!"% 	1CcEMM5<<#@A**w<!# ,:A, ) %++C0	1  r_   c                ,    t        | j                        S rW   )hashrg   rq   s    r]   __hash__zIterationRangesEntry.__hash__b  s    DIIr_   c                X    t        |t              sJ | j                  |j                  k(  S rW   )r   r   rg   )rn   others     r]   __eq__zIterationRangesEntry.__eq__e  s&    %!5666yyEJJ&&r_   )rg   ry   rb   r|   rc   r|   r   r|   r   ra   r   r   r   )rg   ry   r   r   r   )r   r   r   r   )r   objectr   r   )r   r   r   rf   r   r   r   r   r   r   r   r   r   s   @r]   r   r   ,  sf      	
    
.s
# 'r_   r   c                    | t        d      k(  ry| t        d      k(  ryt        j                  |       ryt        |       S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)values    r]   constant_reprr  j  s9    e	%-		E	;r_   CSEVariableType)boundr\   c                  ,    e Zd ZU ded<   ded<   ded<   y)PartialAccumulatery   buffer_namereduction_typer   r  N)r   r   r   __annotations__r   r_   r]   r  r  w  s    Jr_   r  c                  N    e Zd ZU dZeZded<   ded<   dZded<   ded	<   	 	 	 	 	 d<	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d= fd
Zd>dZ	d Z
eed?d              Zd@dZdAdZedBd       ZdCdZ	 	 	 	 	 	 	 	 	 	 	 	 dDdZdEdZdFdZdGdZdCdZdCdZdHdZd?dZd>dZdIdZdBdZdBdZdJdZ	 	 	 	 	 	 dKdZ	 	 	 	 	 	 dKd Z dLd!Z!dMd"Z"e#	 	 	 	 	 	 dNd#       Z$e%e&jN                  jP                  f	 	 	 	 	 	 	 dOd$       Z)e%e&jN                  jP                  f	 	 	 	 	 	 	 dPd%       Z*	 	 	 	 dQd&Z+e%	 	 	 	 	 	 dRd'       Z,dSd(Z-dSd)Z.dTd*Z/	 	 	 	 dJd+Z0dUd,Z1dVd-Z2dWd.Z3d/ Z4	 dX	 	 	 	 	 	 	 dYd0Z5e6jn                  	 	 	 	 	 	 dZd1       Z8d[d2Z9e#d3        Z:d\d4Z;d5 Z<d6 Z=d7 Z>d8 Z?d9 Z@d: ZAd]d;ZB xZCS )^r}   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFr   allow_block_ptrry   kernel_namec                ~    |i }t                    | _        |j                          _        t                _        t                _        |j                         D 	ci c]/  \  }}	|t        j                  j                  j                  |	      1 c}	} _        g  _        i  _        t!        j"                          _        |j'                          _        ||n j+                          _        | _        | _        ||n j3                          _        | _         j9                          _        d  _        t!        j"                          _        d _         tB        jD                  jF                  r j                  jH                  D ]h  }
tK        |
tL        jN                        stK        |
jP                  tR        jT                        sC|
jP                  jW                         dk(  sad _          n tX        jZ                  d fd       }| _.         j_                  |       d _0        g  _1        y c c}	}w )NFdotTc                    t         j                  j                  j                  | j	                               } j
                  D ]  }j                  | |      }  j                  |       S rW   )r<   r   r   simplify_with_rangesri   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r   treern   s     r]   simplify_indexingz.SIMDKernel.__init__.<locals>.simplify_indexing  sb    GG$$99%ARSE(( B44UDAB 66u==r_   r   )r   r|   )2re   rf   featuresget_mutations	mutationsr3   bodyindexing_coderv   r<   r   r   simplifynumelsr  r   	itertoolscountr   rr   inside_reduction should_use_cooperative_reductioncooperative_reductiontiling_scorestilingshould_use_persistent_reductionpersistent_reductionmix_order_reductionwant_no_x_dimno_x_dimr   store_output_ctris_native_matmulr   rZ   native_matmulnode_scheduler   r   SchedulerNoder   r   ComputedBufferget_reduction_typer   cacher  initialize_range_treersplit_sizesaved_partial_accumulate)rn   r%  r  r   override_persistent_reductionoverride_cooperative_reductionr$  r(  rk   valr   r  ro   s   `           r]   rf   zSIMDKernel.__init__  s    I !//1"$	+-FLlln
7BvsFAGG$$--c22
 79JL(0 ( 5 5 7 .9 +668 	"
 ?L-3 -8 *557 	!
 *= **,(, ) 1 %==&&33 tY%<%<="499b.?.?@		446%?,0D) 
	> 
	> "3""9-AC%a
s   "4H9c                    d| dS )Nz<STORE_OUTPUT_>r   )rn   is     r]   _get_store_output_subgraph_namez*SIMDKernel._get_store_output_subgraph_name  s    s!$$r_   c                n    t        | j                        }t        j                  |dz
  d      | _        |S )Nr=   )startstep)r   r+  r  r   )rn   totals     r]   get_store_output_countz!SIMDKernel.get_store_output_count  s.    T**+ )eaia Hr_   c                :    t        d | j                  D              S )Nc              3  2   K   | ]  }t        |        y wrW   )r5   )r   rk   s     r]   r   z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>  s     I6&v.I   )sumr  rq   s    r]   num_reduction_dimszSIMDKernel.num_reduction_dims  s     IT[[IIIr_   c                    t         rW   NotImplementedError)rn   dtypes     r]   dtype_to_strzSIMDKernel.dtype_to_str      !!r_   c                6    | j                   j                         S rW   )r  select_index_dtyperq   s    r]   get_index_dtype_as_torch_dtypez)SIMDKernel.get_index_dtype_as_torch_dtype  s    }}//11r_   c                @    | j                  | j                               S rW   )rK  rO  rq   s    r]   index_dtypezSIMDKernel.index_dtype  s      !D!D!FGGr_   c                     yNFr   rq   s    r]   r)  zSIMDKernel.want_no_x_dim      r_   c                   t        fdt        D              }| xs | }d	d}g d}	t        t        |	            }
ddg}|r|}n
|r|
}n|
|z   } |||      } ||	t              }g }t	        |      D ]s  \  }}t        |      }|j                  |      }|j                  |      }||n|}|j                  t        | d|   ||| ||xr | j                   ||dv 
             u |S )
Nc              3  ,   K   | ]  }|v s|  y wrW   r   )r   rk   r  s     r]   r   z3SIMDKernel.construct_range_trees.<locals>.<genexpr>  s      %
61AF%
   	c                `    t        fd| D              D ci c]  \  }}||
 c}}S c c}}w )Nc              3  ,   K   | ]  }|v s|  y wrW   r   )r   r8  masks     r]   r   zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>  s     2U3PT32UrW  )	enumerate)seqrZ  idxr8  s    `  r]   filtered_index_mapz<SIMDKernel.construct_range_trees.<locals>.filtered_index_map  s3    )22U#2U)U%S#S  s   *)rS   rR   rQ   rT   rU   r   rQ   )r   r   r   r   r   )r   zdict[Any, int])
r   all_prefixeslistr   r[  r5   r   r   r~   r'  )rn   r   r!  rr   r  r*  active_prefixesno_r_dimr^  	grid_dimspointwise_tensor_dimsreduction_dimstensor_dimstensor_dim_mapgrid_dim_mapr  r;  rk   r   r   r   s       `                r]   construct_range_treesz SIMDKernel.construct_range_trees  s3    % %
!-%
 
 (';|+;	
 $	 $Xi%8 9(K/K/.@K ,KI))\B"?3 	IAv.v6L'++F3J#''/H!)AxE#he$6N'(J1J1J-J)% F]	& r_   c                    | j                  || j                  | j                  j                         | j                  | j
                        }| j                  j                  |       y rW   )ri  r!  r  rr   r  r*  r  extend)rn   r   r  s      r]   r3  z SIMDKernel.initialize_range_tree  sR    00!!MM&&(KKMM
 	,r_   c                     y)zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nr   )rn   indicess     r]   finalize_indexingzSIMDKernel.finalize_indexing'  r   r_   c                v    | j                   }d| _         	 | j                  |||      || _         S # || _         w xY wrS  )r!  store)rn   rg   r   r  priors        r]   store_reductionzSIMDKernel.store_reduction-  s;    %% %	*::dE51$)D!ED!s   / 	8c                     yrS  r   rq   s    r]   r"  z+SIMDKernel.should_use_cooperative_reduction5  rT  r_   c                     yrS  r   rq   s    r]   r&  z*SIMDKernel.should_use_persistent_reduction8  rT  r_   c                t    t        t        j                  j                  d | j                  D                    S )Nc              3  P   K   | ]  }|j                   j                            y wrW   )ri   rv   r   r  s     r]   r   z(SIMDKernel.var_ranges.<locals>.<genexpr>=  s"      *,0%%'*r   )dictr  chainfrom_iterabler  rq   s    r]   ri   zSIMDKernel.var_ranges;  s4    OO)) *484D4D* 
 	
r_   c                :    t        d | j                  D              S )Nc              3  J   K   | ]  }t        |j                  d u        y wrW   )r   r   rw  s     r]   r   z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>C  s     Q3td23Qs   !#)rE  r  rq   s    r]   triton_tensor_ndimzSIMDKernel.triton_tensor_ndimB  s    Q@P@PQQQr_   c                ^    dg| j                         z  }d||<   ddj                  |       dS )Nr   :[r   ])r}  join)rn   r;  r   s      r]   indexing_size_strzSIMDKernel.indexing_size_strE  s9    42244a499U#$A&&r_   c                    dg| j                         z  }| j                  D ]R  }|j                  |j                  r| j                  s)|j
                  j                          d||j                  <   T |S )N1BLOCK)r}  r  r   rr   r!  rk   upper)rn   r   r  s      r]   dense_size_listzSIMDKernel.dense_size_listJ  sx    //11$$ 	GD& $$(=(=,0KK,=,=,?+@)Fdoo&	G r_   c                    |j                   }|j                  | j                         }| d| dS dg| j                         z  }d||j                  <   dj	                  |      }| d|j                          d| d}|S )	Nzmask = tl.full(z, True, tl.int1)r   r  r   zmask = tl.full([zBLOCK], True, tl.int1)[r  )rk   r   dense_size_strr}  r  r  )rn   entryrS   sizestrr   suffixouts          r]   create_constant_maskzSIMDKernel.create_constant_maskU  s    LL#))+GSy0@AA42244"%e5!#AGGI;.EfXQO
r_   c                L    | j                         }ddj                  |       dS )Nr  r   r  )r  r  rn   r   s     r]   r  zSIMDKernel.dense_size_str`  s)    $$&499U#$A&&r_   c                   t        |t              s|S |j                  d   }| j                  j	                  |      x}|S t        |||j                  i      }t        j                  j                  j                  |      }t        ||j                  j                         |j                  j                  t        j                  j                   |j                  j"                        j%                         i      S Nr   )r   r   r   r   r   r8   r   r<   r   r   r  rm   r   r   r   r   r   rj   rt   )rn   r   rS   	tree_node	new_indexs        r]   r  z)SIMDKernel.combine_modular_indexing_pairsd  s    %1LJJqM..22155I>Luq)..&9:	GG$$CCIN	((*INN,A,AGGKK!5!5-&(
 	
r_   c                    t         j                  j                  j                  |      x}r!|\  }}t	        | j                  ||      |      S | j                  ||      S rW   )r<   r   r   expand_floor_divr   _combine_contiguous_dims)rn   r   r  
expand_resr  denominators         r]   r  z"SIMDKernel.combine_contiguous_dimsv  s[     ))::5AA:A%/"I{D99)TJKXX00==r_   c                   t        |t        j                  t        j                  f      r|S |j	                  |      \  }}t        |      dk  r|S t        j                  j                  j                  ||t        |g||            \  }}}||k(  r|S |j                  |      }t        |t        t        | ||                        }	|	S )zI
        More aggressive simplification to merge contiguous dims
        r=   )r   r   r   r   r   r   r<   r   r   _simplify_loopsr@   r   r8   rx  zip)
rn   r   r  r   r   	new_sizesreindex_prunenew_index_varsr  s
             r]   r  z#SIMDKernel._combine_contiguous_dims  s     eemmU\\:;L //6
Eu:?L%&WW%5%5%E%E7US&
"	7F L	2ud3z7>;R+S&TU	r_   c                      j                   d   j                  xs  j                  t        j                   fd       } |       S )Nc               3     K    j                   j                         s j                  rJ d  y r j                          d _        	 d  r j                          d _        y # d _        w xY ww)NFT)r  rr   r!  codegen_body)rn   should_flushs   r]   ctxz)SIMDKernel.disable_reduction.<locals>.ctx  sl     ==--/0000 !!#$)D!-%%'(,%%s   AA5A) !A5)	A22A5)r  r   r#  
contextlibcontextmanager)rn   r  r  s   ` @r]   disable_reductionzSIMDKernel.disable_reduction  sE    ''+33Qt7Q7Q		"	"	- 
#	-$ ur_   c                    t        |      t        | j                        k(  sJ t        || j                        D cg c]  \  }}|j                  |       c}}S c c}}w rW   )r   r  r  r   )rn   r   rc   rangess       r]   
set_rangeszSIMDKernel.set_ranges  s]    7|s4#3#34444 #&gt/?/?"@
 V$
 	
 
s   Ac                
   t        d |D              r| D cg c]  }g  c}g fS t        j                  j                  | D cg c]  }g  c}| D cg c]  }j	                  |       c}t        j                         dfd}	 	 	 	 	 	 dd}g }d}|D ]0  }	g }
|	D ]  }j                  |d      r|
j                  d        )|t              k  r>j                  |   d      r)|dz  }|t              k  rj                  |   d      r)t              dk(  xr d   dk(  }|d	z   t              k  rj                  ||   |dz      z        r||rzj                  ||   |dz      z        st        |   }|dz      }t        |||z        }|
j                   |||g |||       ||dz   |       ||d	z   |      g             8|dz   t              k  rj                  ||         sj                  t        ||         d      r]j                  ||         st        |   }t        ||         }|
j                   ||g |||       ||dz   |      g             |t              k  s|
j                  t        j                   |||                    |j                  |
       3 t        d
 D              sJ d d|        |fS c c}w c c}w c c}w )Nc              3  8   K   | ]  }t        |      d k(    ywr   Nr   )r   rc   s     r]   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>  s     6Fs6{a6s   c                    j                  |      }j                  |    |      st        t        |    |      | <   |    j	                  |       t              S rW   )r  statically_known_multiple_of	CantSplitr   r   r   )r;  r   
new_ranges	remainingsv	var_counts     r]   	add_rangez5SIMDKernel._split_iteration_ranges.<locals>.add_range  sZ    ;;t$D229Q<F#IaL$7IaLqM  &	?"r_   c                P     t              t               dz   k(  sJ d fd}|S )z
            Builds the nested expression:
              ((...((s1*v[i1] + v[i2]) * s2 + v[i3]) ... ) * sk + v[i(k+1)])
            r=   c                \    | d      }t        dd        D ]  \  }}||z  | |   z   } |S )Nr   r=   )r  )	flat_varsr   r   r]  idxsr   s       r]   getterzISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter  sH     a)!%ab2 5FAst8in4D5r_   )r  r   r   r|   r  )r   r  r  s   `` r]   make_combinedz9SIMDKernel._split_iteration_ranges.<locals>.make_combined  s+     t9E
Q... Mr_   r   r=   c                6    t         j                  j                  S rW   )r   r   Zero)_s    r]   r   z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>  s    EGGLL r_      r  r   c              3  t   K   | ]0  }t         j                  j                  j                  |      d k(   2 yw)r=   Nr<   r   r   r   r   s     r]   r   z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr><  s*     I!177##--a0A5Is   68zfailed to set ranges  )r;  r   r   r|   r   r   )r   r   r  z	list[int]r   z(Callable[[list[sympy.Expr]], sympy.Expr])r   r<   r   r   r  r  r   r   r   r   statically_known_gtr  r  r   operator
itemgetter)groupsr   groupr  gr  r  return_getters_groupscurrent_grouplength_groupreturn_getterssizeis_bmm_then_pwsize1size2size3r  r  r  r  s                   @@@@r]   _split_iteration_rangesz"SIMDKernel._split_iteration_ranges  s    6g66$*+5B+R//WW:@-AQb-A
-34R[[^4	OO%		# 	#	#	+4	5	" !## ]	9LN$ Z--dA6"))*@A#c)n49S9Sm,:
 "Q&M $c)n49S9Sm,:$ "%Y1!4!K2!9K!A%I6..i6=STCT9UU ' ::i6=STCT9UU (%m4E%ma&78E$T55=9E"))%"EN )- ? )-!*;U C )-!*;U C	 #Q&Y7**4=1IJ **8D)M:R+SUVW ::i6 (%m4E$T9]+CDE"))%"G )- ? )-!*;U C %s9~5&--$//	-0NOqZv "((8{]	9~ IyII 	
#I;ay9	
I 000K , .B4s   	K6	K;L c                   t         j                  j                  }t        |d         dk(  r\|j	                  |t
        j                  j                        s2|j	                  t        |      t        |d         |z        r|d   |gfS |S )z1Fill in the reduction numel of lengths if missingr=   r   )	r<   r   r   r   r   r   r   r   r7   )clsr  r   reduction_numelr   s        r]   prepare_split_iteration_lengthsz*SIMDKernel.prepare_split_iteration_lengthsA  s{     77##wqz?a00%''++N00f%gaj)O;
 AJ 122r_   c                n    | j                  |||      }	 | j                  ||       y# t        $ r Y yw xY wNTF)r  r  r  )r  r  r   r  s       r]   is_compatiblezSIMDKernel.is_compatibleU  sB     55fgW	''8 		s   ( 	44c                >   | j                   D ci c]  }|j                  |j                   }}| j                  s0|D ]+  }t	        |      st
        j                  j                  ||<   - g |j                         }| j                  ||| j                        S c c}w )a5  
        Split and set iteration ranges for the kernel based on the provided lengths.

        This method maps the kernel's tiling structure to the node's iteration space,
        handling both pointwise and reduction dimensions appropriately.

        Args:
            lengths: A sequence of sequences of symbolic expressions representing
                    the sizes of different dimensions for each node.

        Returns:
            A list of lists of symbolic expressions representing the mapped
            iteration variables for each dimension.
        )r  rk   rj   r!  r5   r   r   r   r   map_kernel_groups_to_node_sizesr  )rn   r   rtr%  rk   r  s         r]   split_and_set_rangeszSIMDKernel.split_and_set_rangesd  s    $ 150@0@A""))RXX%AA $$  1&v.%*WW[[F6N1
 $6==?# 33FGT__UU Bs   Bc           
     F   t        |      t        |      k(  r!t        d t        ||      D              r || S | j                  ||      \  }}g t        j
                  j                   ||       }|D cg c]  }|D cg c]
  } ||       c} c}}S c c}w c c}}w )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c              3     K   | ]?  \  }}t         j                  j                  j                  t	        |      |z
        d k(   A ywr  r<   r   r   r  r7   )r   rS   r  s      r]   r   z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>  s@      /
1 GG%%mA&6&:;q@/
s   AA)r   r   r  r  r  ry  rz  )	r  r  r   r  r  r  r   fnsfns	            r]   r  z*SIMDKernel.map_kernel_groups_to_node_sizes  s    & w<3v;&3 /
GV,/
 ,
 w'',/,G,GPW,X)
)LY__22:z3JKL8MN,"H,NN,Ns   7	B BBBc                6    t        |t        j                        S rW   )r   r   TMPrn   r   s     r]   is_indirect_indexingzSIMDKernel.is_indirect_indexing  s    "5$((33r_   c                   | j                  |      rydgt        | j                        z  }|j                  D ]g  }|| j                  vr| j                  |   }t        |j                  t              sJ ||j                  j                  xx   |j                  z  cc<   i t        j                  j                  j                  t        fdt        || j                  j!                               D              S )NFr=   c              3  F   K   | ]  \  }} |       |      k7    y wrW   r   )r   	idx_range
iter_ranger  s      r]   r   z,SIMDKernel.is_broadcasted.<locals>.<genexpr>  s,      
%	: Y8J#77
   !)r  r   r  r   r   r   r   r~   r   rc   r<   r   r   r  anyr  r   )rn   r   index_numelsrt   r  r  s        @r]   is_broadcastedzSIMDKernel.is_broadcasted  s    $$U+sS--(( 	=FT222))&1Eell,?@@@++,<,	= 77##,, 
),\4;;;M;M;O)P
 
 	
r_   c                    t        |t              r)ddj                  t        | j                  |             dS | j                  | j                  |            S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r  r   r  )r   r`  r  mapindex_to_strr  rename_indexingr  s     r]   r  zSIMDKernel.index_to_str  sN     eT"tyyT%6%6!>?@BBzz$..u566r_   c                n   | j                  |      }t        |t        j                  j                  j
                        }t        |j                  t        j                              s(t        |j                  t        j                              r3|j                  t        j                  j                  j
                        }t        |j                  t        j                              r|j                  t        j                        D ]g  }|j                  }t        |      dkD  st        d |D              s1|t        j                  j                  j                  |      i}t        ||      }i | j                  |      }t        |t               s|n|j"                  d   }| j%                  |      S )Nr   c              3  p   K   | ].  }t        |t        j                  t        j                  f       0 y wrW   )r   r   r   PRECOMPUTED_SIZEr   s     r]   r   z.SIMDKernel.prepare_indexing.<locals>.<genexpr>  s.      , #1tyy$2G2G&HI,s   46)r  r8   r<   r   r   precomputed_replacementsr   atomsr   floorceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)rn   r   ar   replacements
simp_indexs         r]   prepare_indexingzSIMDKernel.prepare_indexing  sG    &&u-5!''"2"2"K"KLu{{5;;'(CEMM0J,KJJqww//HHIE u{{5==)*[[/ 	< ..w<!# ,$, ) %&qww'7'7'O'OPQ'R#SL&ul;E	< ++E2
 )X>JJOOTUDV 	 $$Z00r_   c                p    | j                   D cg c]  }|j                  r| j                  r| c}S c c}w rW   )r  rr   r!  )rn   ts     r]   active_range_treeszSIMDKernel.active_range_trees  s9     %%
>>T%:%: 
 	
 
s   !3c                4   t         j                  j                  j                  || j	                               }t        |j                  t              D ]  }|| j                  v si }| j                  |   j                         D ].  }t         j                  j                  j                  |      ||<   0 t        |      dkD  r5t        | j                  |   j                  |      | j                  |   _        | j                  |   j                           |S )Nr   r   )r<   r   r   r  ri   sortedr   ry   r   r   r  r   r8   r   r   )rn   r   symr  pss        r]   r   zSIMDKernel.codegen_indexing  s    ww44T4??;LM$++5 	5Cd+++  "//4EEG TB'(ww'7'7'O'OPR'SL$T|$q(6@--c277$7D))#.3 %%c*224	5 r_   c                    t        d      )NzNYI: codegen_nan_checkrH  rq   s    r]   codegen_nan_checkzSIMDKernel.codegen_nan_check	  s    !":;;r_   c                    t         j                  j                  }t        | j                  j
                        D ]  }|j                  |        y rW   )r<   r   wrapper_coder   r   workspace_argsgenerate_workspace_deallocation)rn   wrapperwss      r]   deallocate_workspacesz SIMDKernel.deallocate_workspaces  s=    ''&&499334 	8B33B7	8r_   c                    t        d      )NzNYI: call_kernelrH  )rn   rg   r   deallocate_wss       r]   call_kernelzSIMDKernel.call_kernel  s     ""455r_   c              #     K   | j                   }| j                  }|rt        j                  ||      }t	        j
                  |      }|| _         || _        	 | || _         || _        y# || _         || _        w xY ww)z:Context manager to add an additional mask to tl.load/storeN)
_load_mask_load_otherr:   logical_andr;   _unwrap)rn   rZ  r  rq  	prior_vals        r]   
mask_loadszSIMDKernel.mask_loads  sy     
 $$	??4/D!!$' 	)J#DO(D $DO(Ds   AA=A* A=*A::A=c                (   | j                   j                         D ci c]  \  }}||j                   }}}t        ||      }i }| j                  D ]7  }t        |j                        }t        ||di      t        ||di      z
  ||<   9 |S c c}}w )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        r=   r   )r   rv   r   r8   r  r6   rg   )	rn   r   kvindex_to_tile_indexesindex_in_tile_varsstrides
range_treer   s	            r]   get_strides_of_loadzSIMDKernel.get_strides_of_load*  s     8<7L7L7R7R7T Utq!AFF U U'/DE** 	J":??3A#$6A?*"QFC GAJ	
  !Vs   Bc                \    t        |t              rt        t        | |            S  | |      S rW   )r   tupler  )r  r  s     r]   _map_tuple_or_scalarzSIMDKernel._map_tuple_or_scalarB  s'    eU#R((%yr_   c                    t        j                  | j                  j                        D cg c]  }|j	                          }}t        t        d |            S c c}w rW   )rH   
only_nodesr  r.  estimate_flopsrE  filter)rn   r   flopss      r]   r,  zSIMDKernel.estimate_flopsH  sX     +55dmm6Q6QR
 !
 
 6$&''	
s   Ac           	     P   g }t        t        | j                  j                  j	                                     }| j                  j                         \  }}}}| j                  j                         }t        j                  j                  j                  t        | j                  j	                               t        j                        }t!        |      D ]B  \  }}||vr|j#                  d       t        j                  j%                  |      }	t        j                  j                  j                  |	t        j                        }
|
|kD  rwt'        t(                  }d}||   D ]M  }t+        |t,        t.        f      r|j1                  d|        |dz  }3|j1                  |j2                         O t        |      |z  }n|
}t        j                  j5                  |      }t7        |      }|j#                  ||z  dt9        ||k        z   z         E t;        |      S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   r   no_index_dep_r=   )r   r9   r   inplace_buffersr   python_argdefsr  buf_accessesr<   r   r   r   r7   r  r   r   r[  r   	get_numelr   r   r   r"   r#   r   r   	get_dtyper2   r   rE  )rn   nbytesninplace_argsr  	call_argsr3  	out_numelr;  r   	arg_numelbuf_sizerm  no_index_dep_countdeprj   rJ  
dtype_sizes                    r]   estimate_kernel_num_bytesz$SIMDKernel.estimate_kernel_num_bytesO  s    F499#<#<#C#C#EFG!YY5579a}}113 GG$$..$++,,./44 / 
	  	* 	MFAs ,&a ))#.Iww''11F$C$C 2 H )# %S/+%&"', /C!#'9:m4F3G$HI*a/*CII./ Gy0 GG%%c*E'.JMM%*,CM8I4J0JKL?	M@ 6{r_   c           	     &   t        | j                  j                        dk(  rEt        | j                  j                        dk(  r#t        | j                  j                        dk(  ry| j                  j                         \  }}}}d}|D ]F  }t        j                  j                  |      }|s&|j                         }	t        |	j                        dk(  sOt        |	j                  D 
cg c]
  }
|
dk(  s	|
 c}
      dk(  r|t        j                  |	j                        }||}||k7  st        d| dd| d	| z         }t        j!                  |       |D cg c]m  }t        j                  j                  |      rJt        j                  t        j                  j#                  |      j                         j                        ndo }}|D cg c]Z  }t        j                  j                  |      r7t        j                  j#                  |      j                         j                  nd\ }}|D cg c]@  }|t        j                  j$                  v rd
n|t        j                  j&                  v rdndB }}|D 
cg c]  }
|
j(                   }}
t        d| d| d| d| d| dz         }t        j!                  |        y t+        d| d      }t        j!                  |       yc c}
w c c}w c c}w c c}w c c}
w )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r=   r   Nr  r   zExpected stride order z, but found stride orderr  z for kernel 
GraphInputIntermediateBufferz  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   input_buffersoutput_buffersr1  r2  r<   r   try_get_buffer
get_layoutr  r   get_stride_orderstrider,   logwarning
get_buffergraph_inputsname_to_bufferrg   r*   )rn   r  argdefsr8  
_signaturer  uniform_stride_orderarg_namebuflayoutrS   stride_ordermsgrg   stride_order_list	size_listsource_listargdef_namess                     r]   warn_mix_layoutzSIMDKernel.warn_mix_layout  s    		''(A-DII,,-2DII--.!3
 ,0II,D,D,F)J#! 0	H''((2C^^%F6;;1$6;;9a!q&9:a?!226==A'/+7()\9%01E0FF^_l^<}EFC KK$ %.) ! 7711$7 ++GG..t4??AHH "	")% ) %.	! ! 7711$7 **40;;=BB!"!I ! %.# !	  177#7#77 %  177#9#99 2!	"#K # 5<#<qAFF#<L#<%(nYK|\m[no&ykk]"MNC KK$a0	b 3K=@TU
 	C[ :)!# $=s'   -
K:
8K:
"A2K?AL?AL	
Lc                   t        j                  ||d|      }d| _        t        j                  | j                  j
                  |      }t        j                  ||      }d| _        t        j                  ||      }t        j                  ||      }t        j                  ||d|      }t        j                  |||f      S )NrE  FT)r:   	reductionr!  
index_exprr  r  truedivsubmulr;   r  )	rn   rJ  r  sum_rnumelmeandxdx2m2s	            r]   welford_reduce_fallbackz"SIMDKernel.welford_reduce_fallback  s    }}UE5%8 % = =uE{{4( $WWUD!ggb"o]]5%4!!4V"455r_   c                    t        j                  ||d|      }t        j                  ||      }t        j                  |      }t        j                  ||d|      }t	        j
                  ||f      S )NmaxrE  )r:   r]  r`  expr;   r  )rn   rJ  r  vmaxr`  rk  vsums          r]    prepare_softmax_twopass_fallbackz+SIMDKernel.prepare_softmax_twopass_fallback  s\    }}UE5%8ggeT"ggcl}}UE5#6!!4,//r_   c                    t         rW   rH  rq   s    r]   codegen_kernelzSIMDKernel.codegen_kernel  rL  r_   c                     y rW   r   rq   s    r]   r  zSIMDKernel.codegen_body      r_   c                     y rW   r   )rn   r  s     r]   r   z)SIMDKernel.codegen_iteration_ranges_entry  rr  r_   )NNNNF)r%  dict[str, sympy.Expr]r  rI   r   r   r6  Optional[bool]r7  ru  r$  Optional[dict[str, sympy.Expr]]r(  r   r   r   )r;  r   r   ry   r   )rJ  torch.dtyper   ry   )r   rw  r   r   )r   r   r!  r   rr   r   r  rt  r*  r   r   list[IterationRangesRoot])r   zdict[str, str]r   r   )rm  Sequence[sympy.Expr]r   r   )rg   ry   r   r|   r  r?   r   r   )r   r{   )r   z	list[str])r   r|   r   r|   )r   r|   r  r~   r   r|   )r   z'contextlib.AbstractContextManager[None])r   r|   r   rz   )r  Iterable[sympy.Expr]r   Sequence[Sequence[sympy.Expr]]r   zStuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]])r  rz  r   r{  r  r|   r   r{  )r  rz  r   r{  r  r|   r   r   )r   r{  r   list[list[sympy.Expr]])r  ry  r   r{  r   r|  )r   r|   r   r   )r   r|   r   ry   )r   rx  )r   r|   r   r|   r   )NT)rg   ry   r   zOptional[IRNode]r  r   r   r   )rZ  zUnion[str, OpsWrapper]r  Union[int, float]r   zIterator[str])r   r|   r   r{   )r   r   )r  r   )Dr   r   r   r   pexprr  r	  r  rf   r<  rA  r   r0   rF  rK  rO  rQ  r)  ri  r3  rn  rr  r"  r&  ri   r}  r  r  r  r  r  r  r  r  r  staticmethodr  classmethodr   r   r   r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r&  r)  r,  r?  r[  rh  rn  rp  r  r   r   r   s   @r]   r}   r}   ~  s    */E&.&&!OT! /38<9=9=$)AD%AD %AD ,	AD
 (6AD )7AD 7AD "AD 
ADF%
 J  J"2 H H5+5 5 	5
 &5 5 
#5n-*
R'
		'
$>>':>	>':	(0
 L1$L1/ML1
L1 L1\ 
 ',ggkk	$ 0 $	
 
( & 
 ',ggkk	$ 0 $	
 
  V5 V	 VD O$O 0O
 
 O O84
,7$1$1 
$1L
"<8 OS66/6GK6	6
 )*)3D)	) )&0  
(CJFP
60"r_   r}   c                     e Zd ZU dZeZded<   d Zd ZeZ	eZ
d Zd Zd Zd	 Z	 d-	 	 	 d.dZd Z	 d/	 	 	 d0dZ	 	 d1dZe	 	 	 	 	 	 d2d       Zd3dZ	 	 	 	 d4dZd ZdddZ	 	 	 	 d5dZd6dZ	 	 	 	 	 	 d7dZdd
d	 	 	 d8dZd Z	 d9	 	 	 	 	 	 	 	 	 	 	 d:dZd Ze  e!jD                  d      d;d              Z#e 	 	 	 	 	 	 d<d        Z$e 	 	 	 	 	 	 d=d!       Z%e 	 	 	 	 	 	 	 	 d>d"       Z&e 	 	 d?d#       Z'e 	 	 	 	 	 	 	 	 	 	 d@d$       Z(e 	 	 	 	 	 	 	 	 dAd%       Z)e 	 	 	 	 	 	 	 	 dBd&       Z*e e+jX                  jZ                  d
f	 	 	 dCd'       Z.e e+jX                  jZ                  d
f	 	 	 dDd(       Z/d) Z0dEd*Z1	 dF	 dGd+Z2d, Z3y
)HSIMDSchedulingzo
    Single Instruction Multiple Data parent class used for fusion across
    multiple different backends.
    z	type[Any]kernel_typec                &    t        d |D              S )Nc              3     K   | ]6  }t         j                  j                  j                  t	        |             8 y wrW   r  r   s     r]   r   z*SIMDScheduling.group_fn.<locals>.<genexpr>  s*     PQQWW%%..}Q/?@Ps   <>r(  r  s     r]   group_fnzSIMDScheduling.group_fn  s    P%PPPr_   c                X	   t        |t        j                        st        |t        j                        r t        j                  j                  ||      S |j                  \  }\  }|j                  \  }\  t        ||      }|j                         r)|j                         s|j                         rA |d       n8|j                         r(|j                         s|j                         r |d       |j                         r|j                         r|k(  xr k(  }|sddlm	} |j                  ||      }|s |d|       |r|j                         s|j                         ra|j                         s||}}| j                  |j                         |      t        fd|j                         D              s	 |d       y|S |j                         s|j                         s|k(  rk(  s|j                         s |d|       y|j                         D ]`  }|j                         r nN|j                         |j!                         z  s7|j                  \  }\  }	}
||	k(  r|
k(  rT |d	||	|
        y ||fD ]  }|j                         s y
 | j                  |j                         |      }| j                  |j                         |      }| j                  |j                         |j                         z   |      }t"        j$                  j&                  rVd
}t)        |      dkD  r%t)        |      dkD  r||cxk(  xr |k(  nc }n||k(  }nt)        |      dkD  r||k(  }|s |d|||       yy
|j                         s|j                         rɉdk(  rdk7  sJ |z  k(  rt        fd|j                         D              s	 |d       yt"        j$                  j*                  r\|j                         sLt-        | j                  |j                         |      j/                               |dfdffv }|s |d       |S y
|k7  r |d       |k(  S |j                         r|j                         rJ | j1                  ||      S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsr   )MixOrderReductionz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)c              3     K   | ]8  }t         j                  j                         |j                                 : yw)r  N)r}   r  r   
get_ranges)r   n2rnumel1r%  s     r]   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr><  s>        ,,' - s   >Az/invalid loop order and tiling for native matmulFz5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)z:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s)Tr   ztiling mismatch (%s, %s, %s)r=   c              3  j   K   | ]*  }t         j                  f|j                                , y wrW   )r}   r  r  )r   r   numel2rnumel2s     r]   r   z*SIMDScheduling.can_fuse.<locals>.<genexpr>  s1       ,,fg->Os   03z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ForeachKernelSchedulerNodecan_fuser  r/   is_split_scanrr   torch._inductor.schedulerr  r,  select_tiling	get_nodesr   is_templateused_buffer_namesget_buffer_namesr   rZ    tiling_prevents_pointwise_fusionr    tiling_prevents_reduction_fusionr(  r   can_fuse_horizontal)rn   node1node2r  numel1whyreduction_can_fuser  r   	pro_numel
pro_rnumelr   tiling1tiling2tiling3condis_reduction_tiling_validr  r  r  r%  s                    @@@@r]   r  zSIMDScheduling.can_fuse  s    eYAABj977G
 77@@NN${{FG${{FGu% )<)<)>!!#<=  "5+>+>+@!!#<=E$6$6$8!'6!1!Hg6H%G%6%?%?u%M"%G "&&(E,B,B,D --/#(%5E ++EOO,=vwO  $oo/	  IJ %%!!#E,>,>,@f$G);((*O ! !& 1 )++-!  $557%:P:P:RR$59ZZ22Iz &) 3:8M \ & ) ' * $)#)& U^  ==? 
 (():FGLG(():FGLG((!EOO$55vwG }}==w<!#7|a'&'<W<&'1\A%"g-D6	 !!!#(:(:(<a<GqL00')) "__.  <= MMBB!--/05**5??+<fELLN1  !,1- 5:;4412V##!!#E,>,>,@@@''u55r_   c           
     z   g t        t        j                            t               t               d fd}fd}fd}fd}t        j                  fd       }fd}	|D ]  }
|
v rj                  |
        ||
      r? |	|
      r |       5  	 d d d        r ||
      sxs t              nd  ||
       ` ||
      r" |       5  j                  |
       d d d        t        d d d	|
j                  d
           S # 1 sw Y   |xY w# 1 sw Y   xY w)Nc                b    | j                   \  }\  }}|k(  xr |k(  xs |z  k(  xr |dk(  S Nr=   r  r   r  
node_numelnode_rnumelrj   rc  s       r]   fits_in_main_bodyz@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_body  sH    +,77(A(
K%'AK6,A efn,A1Ar_   c                N    | j                   \  }\  }}|k(  xr |dk(  xr dk7  S r  r  r  s       r]   fits_outside_reductionzESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reduction  s4    +,77(A(
K&K;!+;K!Kr_   c                \    | j                   j                  D ]  }|j                  v s y yr  )read_writesreadsrg   )r   readcurrent_loop_buffer_usages     r]   expect_improved_memory_usagezKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usage  s1    ++  99 99  r_   c                   j                  |        j                  |        j                  | j                  j                  D cg c]  }|j
                   c}       | j                         rt        | t        j                        rrt        | j                  t        j                        rNt        | j                  j                  t        j                        s j                  | j                                y j                  | j                  j                   D cg c]  }|j
                   c}       y c c}w c c}w rW   )r   r   updater  r  rg   rr   r   r   r/  r   r   r0  dataScanget_namewrites)r   rS   r  doner.  not_ready_yet_nodess     r]   schedule_node_in_loopzDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop  s    HHQK  #%,,amm>Q>Q-Raff-RS
  q)"9"9:qvvr'8'89"166;;8#''

5)00!--BVBV1WQ!&&1WX .S 2Xs   D; E c               3  L  K   rd   t         u rj                          nj                  t               r1j	                  t               j	                  dz   t                d d  j                  t                j                           j                          y w)Nr  r=   )rF   popr   rE   insertclear)r  maybe_split_indexr.  r  s   r]   end_current_reduction_loopzISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loop  s      r!2o!E!!#$$%56 $$%68HI$$%6%:OL$(!  1%%'%++-s   B!B$c                    dk(  ry| j                   z  sy|rt        |d   t        t        f      rJ t	              S )Nr=   Fr  )	ancestorsr   rF   rE   r   )r   r.  r  rc  s     r]   #requires_closing_previous_reductionzRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reduction  sN    {&7 b!O5E#F*   +,,r_   zunexpected group: (r   z) != r=   )
r   r   r-   r  r  r   r   r   rI  r  )rn   r   rj   rc  r  r  r  r  r  r  r   r  r  r  r.  r  s     ``       @@@@@r]   generate_node_schedulez%SIMDScheduling.generate_node_schedule  sW   #%)5568 0:|5?\!+/		L		Y" 
	"	"	. 
#	.	-  	Dt|HHTN &6t]K35  -5QRV5W(9(OS=O% )-%%d+'-/1 /!((./ / *)%6(%

1O -	4 ' / /s   (D%&D1%D.	1D:	c                    |j                   |j                  }}|j                  |j                         z  s|j                  |j                         z  rJ | j	                  ||       y rW   )r  r  r  get_operation_names_codegen_mix_order_reduction)rn   r   r  r  s       r]   codegen_mix_order_reductionz*SIMDScheduling.codegen_mix_order_reduction	  s[    zz4::u OOe&?&?&AAOOe7799	
 
 	))%7r_   c                    |j                         }g }g }|D ]5  }|j                         r|j                  |       %|j                  |       7 ||fS rW   )r  rr   r   )rn   r   r   
reductions	epiloguess        r]   #_split_mix_order_reduction_epiloguez2SIMDScheduling._split_mix_order_reduction_epilogue  s\     
	 	'D  "!!$'  &		'
 9$$r_   c           	        |j                   |j                  }}|j                  }| j                  |||dg|dddd      d   }|j                  sJ |j
                  sJ ||_        | j                  ||       |j                  j                  t        |j                        |j                  d   z  |j                  d   |j                  z   dz
  |j                  z  z  d	t        j                  
      \  }}	}
|
dk(  s
J d|
       |5  |j                          ddd       t!        j"                         }t%        j&                  |      5  |5  |r%|j)                  t+        j,                  d             |j/                         }ddd       ddd       |r)j1                  t3        t4        j6                        d      }||	fS # 1 sw Y   xY w# 1 sw Y   MxY w# 1 sw Y   QxY w)z
        for_benchmark:
            True if the generated code is for benchmarking. We need make
            sure benchmark harness code is generated.
        )rS   rT   NT)r  r$  r(  r6  r   rT   rS   r=   F)rJ  zws_off=)benchmark_kerneltriton_)rj   r  r.  create_kernel_choicesr'  r(  r4  !codegen_node_schedule_with_kernelr   	workspacer   r5  r  rX   r   r  r  	ExitStackr<   set_kernel_handlerenter_contextr   patchrp  replacery   r4   KERNEL_NAME)rn   kernel_features
split_sizefor_benchmarkrj   rc  r.  rl   r  ws_namews_offstacksrc_codes                r]   -_generate_kernel_code_for_mix_order_reductionz<SIMDScheduling._generate_kernel_code_for_mix_order_reduction  s    (--/N/Nv'55++()+!%'+15		
 	 ****))))'..}fE $[[22//0mmE"#c"V%7%77!;@R@RRT ++ 3 
7F {(wviL({ 	"!	" $$&!!&) 	/5 	/##FLL$$GH,,.H	/ 	/
 
  ''K,C,C(DiPHw((	" 	"	/ 	/ 	/ 	/s0   
GG8GGGG	GG(Nc                    t         rW   rH  )rn   modn_spills_threshold
node_namess       r]   benchmark_codegened_modulez)SIMDScheduling.benchmark_codegened_moduleS  s
     "!r_   c                     t         j                  j                        \   }t        j                  j
                  j                  t        j                   |            s j                  |      S  fd} |       }t        xj                  dz  c_        t        j                  j
                  j                  t        j                   |            sJ  j                  |      \  }}g }|D ]C  }	|	j                          |	j                         }
|
j                          |j!                  |
       E  j#                  j%                         |z    |      }t'        | |      t(        j*                  j,                  j.                  sqt,        j0                  j2                  Wt,        j0                  j4                  s t,        j6                  st,        j8                  r fd}t;        j<                  ||d      } j?                  |d      \  }}}tA        |d   jB                  jD                        }i }|rZ|D ]  }	|	jG                         d   jB                  jI                         }|	jG                         d   jJ                  d   jB                  jG                         d   jB                  jI                         }|||<    j                   sJ  j                   jL                  jO                  |	jG                         d   jJ                  d   jB                  jI                                t        j                  jP                  jO                  |        |jR                  D ]-  }|jU                  |jV                  |jV                        |_+        /  jY                  |||      }||_-        t]        |      |_.        t        j^                  |      5  ja                         D ]@  }|jG                         d   jB                  jI                         |vs1|jc                          B 	 d d d        t        j                  jd                  jg                  d        ji                  |d        |jk                  |jZ                  d	       t        j                  xjP                  |jP                  z  c_(        t        j                  xjl                  |jl                  z  c_6        to        |      to        |jR                        k(  sJ t        j                  jd                  jq                   |z   dz
  |z        }ts        |jR                        D ]  \  }}|jV                  }| d
| }| d
| }d| d| }ddd}|jU                  |jt                  |jt                        }t        j                  jd                  jw                  | d| d| d| d| d| d| d       t        j                  jd                  jx                  jO                  |        |j{                          |r j}                  |        j                          y # 1 sw Y    xY w)Nc                 r   t         j                  j                  t         j                  j                  S t        j                  j                               } | j                  }|dz  }t        j                  j                  j                        }t        t        ||z        d      }t        |d      }|S )N         )r   rZ   mix_order_reduction_split_sizer)   create
get_devicemulti_processor_countr<   r   r   r   rj  r+   min)device_propnum_smestimated_num_splits
numel_hintr  r  rj   s        r]   _pick_split_sizezESIMDScheduling._codegen_mix_order_reduction.<locals>._pick_split_size^  s    }};;G}}CCC +11%2B2B2DEK 66F#)A:  ))33E:J_Z;O-OPRTUJZ-Jr_   r=   c                    j                  | d      \  }}}t        j                  |      }j                  |      \  }}|S )NTr  r  )r  r    loadr  )candidate_split_sizer  r  r  msr  rn   s        r]   _benchz;SIMDScheduling._codegen_mix_order_reduction.<locals>._bench  sS    !%!S!S#3"& "T "1h
 "&&x077<A	r_   r  Fr  r   z!# Call mix order reduction kernel)r  z * (z + 1) * aminamax)r  rj  z = r  z : z].view(r   z).z(dim=0))@r   r  get_numel_rnumelr<   r   r   evaluate_exprr   Gtr  r   r  r  cancel_reduction_splitextract_pw_from_reductionswap_pw_red_dimensionr   r  r  rI   rX   rY   r   deterministicrZ   r  'mix_order_reduction_autotune_split_sizemax_autotunecoordinate_descent_tuningr(   autotune_single_fieldr  r   r   _split_sizeget_outputsr  usersremoved_opsr   removed_buffersr5  r   r  define_kernelr  r   r  scheduler_nodesmark_runr  make_commentcodegen_commentr  inplaced_to_remover   codegen_python_sizevarr[  r  	writeline	allocatedr  _codegen_nodesfree_buffers_in_scheduler)!rn   r  r  rc  r  r  node2_reductionsnode2_epilogueconverted_nodessubnode	convertedr.  r  rl   r  r  is_split_reductionrenamebufnameusernamepartial_accumr  r   nsplitr]  r  
stride_strr>  endreduction_type2opopnamer  rj   s!   ``                             @@r]   r  z+SIMDScheduling._codegen_mix_order_reductionX  s   !33DDUKvww--ehhuf.EF44UEBB	  &'
 	++q0+ww--ehhuf.EFFF ,0+S+S,
(. ' 	.G**,99;I++-""9-		.
 33OO/
 -]E6J &&44<<DEE&&33 '<<J %)$V$V! %W %
! ""21"5":":"F"FG+ 5!--/277@@B'')!,U1T++-+ T((*	  #+w~~%~**..'')!,2215::CCE ''++G45 "(!@!@ ,2JJ!--}/H/H-)
 ((=&I($X.!!&) 	$'779 $ ##%a(--668FMMO$	$ 	
))*MN]D16--UC	6#9#99	""f&?&??" ?#s6+J+J'KKKK%%<<Z!#
2
 #,F,K,K"L 	<C'33K"83vh/Je3zl+EcU(:,/C! '**,,m.J.JF GG  **-s7)1UG3se76("VHTVW]V^^ef
 GG  **..{;'	<* 	$$&/&&(]	$ 	$s   AY
Y

Yc                b   | j                   sJ |D cg c]+  }|j                         | j                   j                  vs*|- }}|sy t        |d       j                  \  }\  }}| j                  |||      }t        j                  d|       | j                  t        ||||            S c c}w )Nc                4    t        | j                               S rW   r   rr   rS   s    r]   r   z/SIMDScheduling._codegen_nodes.<locals>.<lambda>  s    c!..:J6K r_   r   zSchedule:
 %s)
r   r  r  rj  r  r  schedule_logdebugcodegen_node_schedulerI   )rn   r   coalesce_analysisr   r  rj   rc  r.  s           r]   r  zSIMDScheduling._codegen_nodes  s    
 ~~~"
dmmoT^^=W=W&WD
 
  ,KLRR?E633E5&I+];))}eV=NO
 	

s
   +B,B,c                   | j                   sJ |j                         D cg c]*  }|j                         | j                   j                  vr|, }}t	        |      dk(  ryt
        j                  j                  j                  j                  r_t	        |      t	        j                               k7  r.| j                   sJ t        j                  | j                   |      }t        |      }nd}| j                  ||      S c c}w )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        r   N)r   r  r  r  r   rX   rY   r   rZ   coalesce_tiling_analysisFusedSchedulerNoder   r  )rn   r   r   r7  s       r]   codegen_nodezSIMDScheduling.codegen_node  s     ~~~ (
}}dnn&@&@@ 
 

 u:???!!((AA5zS!122~~%~ 33DNNEJ 9$ ? $""5*;<<!
s   /Dc                   t        j                  t         j                        j                  }t	        |       sy|D cg c]0  }|j                         r|j                         j                         2 }}|D ]}  }|j                         rt        |t        j                        s/|j                         }||D cg c]0  }|j                         r|j                         j                         2 c}z  } t        d |D              syt        j                  j                  j!                  | |       |D ],  }t        j                  j                  j!                  ||       . yc c}w c c}w )NFc              3  2   K   | ]  }t        |        y wrW   )r1   )r   r  s     r]   r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>=  s     FD)$/FrD  T)rX   iinfoint32rj  r1   has_tensor_outputrG  storage_sizer   r   MutationOutputget_mutation_buffersr   r<   r   r   	check_leq)rj   buffersint_maxrS  	buf_sizesmutated_bufsr  s          r]   can_use_32bit_indexingz%SIMDScheduling.can_use_32bit_indexing   s=    ++ekk*..%e, 
$$& NN))+
	 
  	C((*z#r?P?P/Q"779+,,. NN$113 		 FIFF 	
""5'2 	6DGG&&tW5	6/
s   5E&;5E+c                t   |j                   }| j                  ||j                  |j                  |j                        \  }}| j                  ||g||d      }|D ]  }| j                  ||        t        j                  |       |D ]p  }t        j                  |      5  |j                         }ddd       | j                  ||      }t        j                  d|       ||_        t!        |      |_        r ~t#        |      dkD  rt        |      }	n|\  }	t        j                  |	      5  |j%                         D ]  }
|
j'                           	 ddd       |D 
cg c]  }
t)        |
t*              s|
 }}
| j-                  ||	j                         t.        j0                  j2                  r\t        j4                  j6                  j9                          t        j4                  j6                  j;                  |	j                  |       |	j=                  |	j                         t.        j0                  j2                  r(t        j4                  j6                  j?                          t.        j@                  r|	jC                          t.        jD                  r|	jE                  |d   j                         t        j4                  xjF                  |	jF                  z  c_#        t        j4                  xjH                  |	jH                  z  c_$        t        j4                  j6                  jJ                  rt.        jL                  r|d   jN                  jQ                         }|j%                         D ]  }
|
jS                         }||vr|
jT                  J |
jT                  jW                         }|CtX        d   dxx   dz  cc<   t        j4                  j6                  j[                  d|j\                  d	| d
        | j_                          y# 1 sw Y   kxY w# 1 sw Y   xY wc c}
w )z<
        Generate code for nodes in kernel_features
        )r  r$  Nz+Generating kernel code with kernel_name: %sr=   r   inductorintermediate_hookszrun_intermediate_hooks(r   r   )0r.  get_tiling_and_scoresrj   r  r7  r  r  rC   merge_workspaces_inplacer<   r  rp  r  rJ  r5  r  r   r   r  r  r   r-   r  r   cppenable_kernel_profiler   r   write_kernel_context_guard_beginwrite_kernel_context_guardr  write_kernel_context_guard_endnan_assertsr  r[  r  r  supports_intermediate_hooksgenerate_intermediate_hooksr   live_output_buffersr  r   get_origin_noder   r  rg   r   )rn   r  r.  r%  tiling_scorekernelsrl   r  r  final_kernelr   base_scheduler_nodes	live_outsrg   origin_nodes                  r]   r6  z$SIMDScheduling.codegen_node_scheduleG  s    (55#99!!++--	 
 ,,H(<H

  	JF22=&I	J,,W5 	3F%%f- 3!0023,,X}fMKIIC[Q!,F(2F	3  w<!&w/L%O\!!,/ 	 '779   	  + 
j?P.QD 
  
 	1<3K3KL::++GG  AACGG  ;;(($ 	  !9!9:::++GG  ??A**,!!(()?)?@	<#?#??	""l&E&EE" GG  <<22  
;;=I'779 
}}y(yy,,,"ii779*Z()=>!C>GG((221+2B2B1ERvQO
 	&&(y3 3	  	 
 
s$   !P0&P(#P59P5P%	(P2c                (     | j                   |i |gS rW   )r  )rn   r  kernel_argskernel_kwargss       r]   r  z$SIMDScheduling.create_kernel_choices  s)     D
 	
r_   c           	     <   |5  t        j                         }i }|D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          D|j                          |j                  |j                               }|j                  t        j                  |j                  j                  |      j                                       |j!                  |j#                                |D ]  }|t        u r |j                  |j	                                +|t
        u r|j                          Dt%        |j                         |j                  |j                               }|j'                  |        	 d d d        y # 1 sw Y   y xY wrW   )r  r  rE   r  r  rF   closedecide_inplace_updater  r  r  rx  fromkeys_bodyindexing_from_argsr   rn  keysr'   r   )rn   r.  rl   r  all_indexingr   r   s          r]   r  z0SIMDScheduling.codegen_node_schedule_with_kernel  sS    	-((*EL & ++''(@(@(BC_,KKM..0!'!<!<T__=N!OJ '' JJ99*ELLN $$\%6%6%89 & 	-++''(@(@(BC_,KKM 6djjA!'!<!<T__=N!OJLL,	--	- 	- 	-s   FFFFonly_gen_src_codec               l	   i }|j                         }g }	|D ]  }
|
j                         }|	j                  |
       ||z  s*t        |      dk(  sJ |	|t	        t        |            <   |j                  j                  t	        t        |                   g }	 t        |	      dk(  sJ |5  |s|g|D ]  }|j                            |       }|j                         }t        |      D ]  }|j                  |      }|j                  |      5  |D ]0  }|j                  |j                  |j                                      2 |j                   j#                  t%                      ddd        |j&                  j)                         D ]+  \  }}d| d}|j+                  |j-                         g       x}	s0t/        d |	D              }t1        j2                  d|       5  |j                  |      5  |	D ]  }t        |j                               dk(  r<t        |	      dk(  r.t5        |      r#|xj6                  |j                         z  c_        |j                  |j                  |j                                       |j                   j#                  t%                      ddd       ddd       . 	 ddd       t9        j:                  |      5  t=        t>              s`t@        jB                  jE                  |jF                  jH                        5  |jK                  d       ddd       |jK                  d	d
       |j&                  D ]  }d| d}|jK                  |d
        |j                         }t        |      D ]$  }|j                  |      }|jK                  |       & t=        |t>              r|}n|jM                         }g |||}t0        jN                  rH|jQ                         dz  }|jS                          d| d|jU                  |      jW                          }|r|cddd       S | jY                  |||      |_-        |cddd       S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   pxY w# 1 sw Y   yxY w)zK
        Helper method to codegen a single template kernel variant
        r=   r   Nz<LOAD_INPUT_r:  c              3  <   K   | ]  }|j                           y wrW   )can_codegen_without_upcasts)r   p_ns     r]   r   z:SIMDScheduling._codegen_single_template.<locals>.<genexpr>  s      5>A7795   ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eArC  ).r  r  r   r   r   iterprologue_fused_inputsr   r  rA  ranger<  set_subgraph_bodyr   r  r  cse
invalidater   named_input_nodesrv   r   r  r   r   r  r   #prologue_fused_inputs_preserve_zeror<   r  r   ry   r   r&   current_originsr   originsfinalize_hookfinalize_remainingr  r?  imports_for_benchmark_kernelcodegen_kernel_benchmarkgetvaluer  r  )rn   rl   rendertemplate_nodeepilogue_nodesprologue_nodesrk  buf_name_to_prologue_grouptemplate_readsprologue_groupprologuenamesr   partial_codenum_store_subgraphsr;  subgraph_name
input_namebuffercan_codegen_without_upcastprologue_noder  r.  num_gbs                           r]   _codegen_single_templatez'SIMDScheduling._codegen_single_template  s    &("&88:& 	$H--/E!!(+~%5zQ&@N*4U+<=,,00d5k1BC!#	$ >"a''' /	@$ +<^< $DMMO$ "8L"("?"?"A./ 8 & F Fq I--m< 8 . UV%@%@AR%STUJJ))*,78 88 '-&>&>&D&D&F @"
F".zl! <%?%C%COO%r& >  25 5ES5 2.  7=W9W @ $55mD @1? "$'(F(F(H$IQ$N(+N(;q(@'CM'R(.(R(R,9,J,J,L)*(R !. 5 5$*$?$?(5(@(@(B%&!"" #JJ11*,?!@@ @@#/	@p !!&) *	lC0YY..}/A/A/I/IJ ? ..~>?**;u*E %66 H
".zl! <**=*GH
 #)"?"?"A./ : & F Fq I**=9:
 ,,' (::<MnMmMnMM&&99;cA::<=Rj66v>GGIJL  !M*	 *	P "&!3!3HmV!TFU*	 *	Y8 8&@ @@ @9/	@ /	@v? ?*	 *	s   +A$RAQ)(AR<)R%R7B1Q6	(R0RAR*R/DR*R*)Q3.R6R ;RRRRR'	"R**R3c                   ddl m fdg }t        |j                        |gz   D ]S  }t	        |t        t
        f      r$|j                  t        fd|D                     =|j                   |             U t        |      S )Nr   r%   c                    t        |       sy t        | t        j                        r| j                         } | j	                         x}y t        d |D              S )Nc              3      K   | ]  }|  y wrW   r   r   s     r]   r   zKSIMDScheduling._get_multikernel_shapes.<locals>.get_size.<locals>.<genexpr>S  s     )q)s   )r   r   BaseViewunwrap_viewmaybe_get_sizer(  )r   r  r&   s     r]   get_sizez8SIMDScheduling._get_multikernel_shapes.<locals>.get_sizeL  sR    c6*#r{{+oo'**,,5)D)))r_   c              3  .   K   | ]  } |        y wrW   r   )r   _argr  s     r]   r   z9SIMDScheduling._get_multikernel_shapes.<locals>.<genexpr>X  s      @D$ @s   )r   r&   r`  inputsr   r(  r   )rn   r   r  r   r&   r  s       @@r]   _get_multikernel_shapesz&SIMDScheduling._get_multikernel_shapesG  st     	 	* $v- 	*C#e}-

5 @C @@A

8C=)		*
 Szr_   c                H    | j                  |      }t        d |D              S )Nc              3  @   K   | ]  }t        d  |D                yw)c              3     K   | ];  }t        |t        j                        xr t        |t        j                          = y wrW   r   r   Exprr   r   s     r]   r   zFSIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>.<genexpr>`  s9       1ejj)N*Q2N.NNs   AAN)r  )r   shapes     r]   r   z<SIMDScheduling._kernel_has_dynamic_shapes.<locals>.<genexpr>_  s,      

 	   
s   )r  r  )rn   r   shapess      r]   _kernel_has_dynamic_shapesz)SIMDScheduling._kernel_has_dynamic_shapes]  s.    --d3 

  
 
 	
r_   c                N    | j                  |      }t        fd|D              S )zk
        Returns cache key for hint-based multi-graph; key is tuple of shapes with hint filled in.
        c              3  F   K   | ]  }t        fd |D                yw)c              3     K   | ]<  }t        |t        j                        rt        |t        j                        sn| > y wrW   r  )r   r   hints     r]   r   zASIMDScheduling._make_shape_cache_key.<locals>.<genexpr>.<genexpr>o  s@        a,Z5==5Q s   AANr  )r   r  r  s     r]   r   z7SIMDScheduling._make_shape_cache_key.<locals>.<genexpr>n  s/      
    	 
r  )r  r(  )rn   r   r  r  s     ` r]   _make_shape_cache_keyz$SIMDScheduling._make_shape_cache_keyg  s1     --d3 
  
 
 	
r_   rk  hint_overridec          	        |j                   \  }\  }}|dk(  sJ t        |j                  t              r|j                  j                  rt        |j                  j                        dkD  r| j                  |j                        ri }	g }
|j                  j                  j                         D ]  \  }} ||j                  |      \  }}|r;| j                  |||||d      }t        |t              sJ |
j                  |       Z|]| j                  |||||d      }|dn| j                  |j                  |      }||	|<    |rdj                  |
      S t        j                  t        |	j!                                      t#        |	      }g |||}| j%                  ||j&                         |j)                  |j&                         t*        j,                  xj.                  |j.                  z  c_        t*        j,                  xj0                  |j0                  z  c_        | j3                          y|j                  j5                  |j                  |      \  }}|r| j                  |||||d      S | j                  |||||d      }g |||}| j%                  ||j&                         |j)                  |j&                  |j                         t*        j,                  xj.                  |j.                  z  c_        t*        j,                  xj0                  |j0                  z  c_        | j3                          y)z
        Codegen a triton template with multi-kernel dispatch support

        If `only_gen_src_code=True` the src code will be returned instead of being
        codegenned into the wrapper
        r=   )r  Trj  NFz

)r  r   r   r   _make_kernel_rendersr   r  rv   r  ry   r   r  r  rC   rN  r`  r   rD   r  r  r  r<   r   r  r  r   make_kernel_render)rn   r  r  r  rk  r  r  _numelrc  rZ  	src_codesr   r  rl   r  r  shape_cache_keymulti_kernelr.  s                      r]   codegen_templatezSIMDScheduling.codegen_templatex  sD     ,11FF{{ }))+>?""77M&&;;<q@//0B0BCGI
 ##88>>@$6 "!3!&&m" %#<<%&&*.  =  H &h444$$X. ( !::%&&*/ ; F %, !778J8JIV $
 06GO,I$6L !{{9--00gnn6F1GH.w7LMnMmMnMM  0H0HI$$\%=%=>GG##|'C'CC#GG&&,*I*II&**,*//BB""- C NFF !44!""&* 5   66!""&+ 7  !R. Q- Q. Q$$]F4F4FG""6#5#5}7I7IJ''6+A+AA'**f.G.GG*..0r_   c                    t         j                  j                  j                  t         j                  j                  j                                y rW   )r<   r   r  r  
device_opssynchronizerq   s    r]   codegen_synczSIMDScheduling.codegen_sync  s-    	&&qww'9'9'E'E'GHr_   c           
     x   ddl m} |D cg c]  }|j                          }}i i }
}	t        ||      D ]u  \  }}t	        |d       j
                  \  }\  }}| j                  |||      }| j                  |||      }||||f|
|<   |j                  |t        |||      |       |	|<   w |j                  || ||	|
      }t        j                  dt        |      |D cg c]  }t        |       c}       g }|D ]#  }t        |      dk(  r |||	      }|D ]  }| j                  |
|   d   |j                  |	|                |	|   }|
|   d   }|sIt!        j"                  |      5  t%        j&                  |      D ]  }|j)                           	 d d d        t         j*                  xj,                  |j,                  z  c_        t         j*                  xj.                  |j.                  z  c_         |j1                         }|j3                  |||f       & |S c c}w c c}w # 1 sw Y   xY w)
Nr=   )ComboKernelc                4    t        | j                               S rW   r2  r3  s    r]   r   z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>      #ann>N:O r_   r   )r  optimize_mask)r   triton_schedulingcustom_algorithm
kernel_mapnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsr   )enable_autotunemixed_sizes)triton_combo_kernelr  r  r  rj  r  r  r  create_triton_kernelrI   horizontal_partitionrJ  r5  r   r  create_sub_kernelr<   r  rH   r+  r  r   r  r  rp  r   )rn   subkernel_nodescustom_part_algorithmr  r  rk  r  r   fused_node_listssubkernel_mapnode_schedule_mappnr   r  rj   rc  r.  r%  
partitionspkernel_code_list
node_grouprl   	subkernelr  s                            r]   generate_combo_kernel_codez)SIMDScheduling.generate_combo_kernel_code  sm    	59HIDNN,II+-r(_.>? 		IB!$U0O!P!V!VAv 77ufMM''ufEF$165&$Hb! + @ @+M5&I"-o !A !M"		 !55!"2$+ 6 

 			? '(SV(	

 $ 	DJ:!#  /'F
 ! K66%b)!,,,]2->? *"-	 1" 5a 8(--i8 ,$6$A$A-$P ,D MMO,, ''9+D+DD'**i.J.JJ*K ,,.H##Xvz$BC/	D0  e J. )&, ,s   H& H+-+H00H9c                   |j                         }|j                  }|j                  }t        j                  dkD  xs t        j                  dk(  xr |}| j                  ||||      }|D ]v  \  }}}	| j                  ||g|      }
| j                  |j                  |
       t        j                  d|
       |j                  t        j                  j                  |
       x | j                          y )Nr=   z"ComboKernels: generated kernel %s.)get_subkernel_nodesuse_custom_partition_algor  r   combo_kernel_allow_mixed_sizesr  r  r  snodesrJ  r5  r  r<   r   r  r   )rn   combo_kernel_noder  r  r  r  r  r  rl   r  r  s              r]   codegen_combo_kernelz#SIMDScheduling.codegen_combo_kernel(	  s    +??A 1 K K+;;;;a? 
11Q6P;P 	  ::2O[
 $4 	BHfa,,X8I7JFSK  !2!9!9;GII:KHqww33[A		B 	&&(r_       c           
        
 dk(  }d 
fd}|j                         \  }
t        |      dk  rt        
      dk  st        |
z         rg S |j                         \  }
 |||r|n
|j                  |            }|D cg c]?  }t	         j                  |j                  |      |j                  |j                        A }	}|	S c c}w )Nr=   c                d   t        |j                        t        |      k(  sJ d|j                  d|       |j                  |j                  g}t	        d t
        j                  j                  |      D              sJ t
        j                  j                  |      D cg c]:  }|j                  t        j                  j                  vrt        |t              r|< }}t        |j                  D cg c]  }|j                   c}      }dd}t        j!                   ||      g|       dd      g}|D ]  }t        j                  j"                  j%                  |j&                  |j                        }	t        |	      t        |      k(  sJ 	 |	j'                  d      dz   }
|
t        |      k(  rt	        d	 |	|
d
 D              r	  ||d
|
        |||
d
       f}t        j                  j"                  j+                  t-        d t/        ||	      D                    }|j                  |v r|dz  }t        j1                  |d         r|dz  }t        j1                  |d         r|dz  }t        j                  j"                  j+                  |t-        t        j                  |            z
        dk\  s|j3                  t        j!                   ||d
|
        |||
d
       g      ||j                                |S c c}w c c}w # t(        $ r Y w xY w)zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c              3  H   K   | ]  }t        |t        t        f        y wrW   )r   r!   r"   )r   r=  s     r]   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>L	  s$       3G 45s    "c                f    t         j                  j                  j                  t	        |             S rW   r  )r  s    r]   collapse_rangeszNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesX	  s"    ww''00v1FGGr_   noner   )r%  rg   scorer=   c              3  &   K   | ]	  }|d k(    ywr  r   r   s     r]   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>n	  s     ;a16;s   Nc              3  2   K   | ]  \  }}|d k7  s|  ywr  r   )r   r  rI  s      r]   r   zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>}	  s       "!-vST"s   r   r%  r  rg   )r  ry  r   r|   )r   
range_varsr  r  r   r  ry  rz  rg   r<   r   r  r   r!   r   CandidateTilingcreate_partial_tilingr   stride_hintsr   
ValueErrorr   r7   r  is_good_sizer   )is_pointwiser  rwdep_sourcesr=  depswrite_namesr  tilingsr$  splittiled_groupsr  r  r  reduction_rangess                r]   tile_rangesz5SIMDScheduling.candidate_tilings.<locals>.tile_rangesA	  s    r}}%V4S8H	&6SS4 88RYY/K $??88E    %??88E88177#:#::sI. D  %"))%D3chh%DEKH
  44(01<  G  4''**77		2==Q7|s6{222
#MM!,q0EF+ ;756?;; ! < $F6EN3#F56N3  ((22! "14VW1E" 
 88{*QJE"//Q@QJE"//Q@QJE GG$$..ioofFV.W XX 
 NN'#&#<#<$3F6EN$C$3F56N$C!" !0$ #(!$
Q4l N[ &E: " s$   $?L8L"L"=L""	L/.L/r  )r  r   r   list[CandidateTiling])	r  r   r   "pointwise_or_reduction_read_writesr  complete_partial_tilingr%  r  rg   )r  r   rj   r  r  r  pointwise_rangespartial_tilingsr%  full_tilingsr  s   `  `      @r]   candidate_tilingsz SIMDScheduling.candidate_tilings<	  s     '!+\	| .2__->** !Q&$%*$%58H%HII .2__->**% ,2B33LA
 *	
  22MM5/ ll[[	
 	
 	
s   ACc                    g dt        |       d }ddgdt        |       }t        g t        ||      t        ||            S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rQ   rR   rS   NrT   rU   )r   r   r  )r  	pw_tilingreduction_tilingpw_prefixesreduction_prefixess        r]   create_tilingzSIMDScheduling.create_tiling	  sY     &s9~o&78#U^,Cc2B.CDVc+y)VC0BDT,UV
 	
r_   c                >    | j                  |r|ng |s|      S g       S rW   )r  )r  r%  r  s      r]   r  z$SIMDScheduling.create_partial_tiling	  s0       "F&F
 	
,.
 	
r_   c                    t        |j                               }d|v }||z  }|t        |      z  g}|r||fn||f} | j                  | S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rS   )r`  r   r7   r  )	r  r%  rj   r  splitsr  total_numelmissing_tilingtiling_argss	            r]   r  z&SIMDScheduling.complete_partial_tiling	  sf     fmmo&f}o-%f(==> )5V^$>6:R 	 !s  +..r_   c           
        |dk(  }t        t        t        t        j                  f             }t        j                  |      D ]  }t        |t        j                        s|j                         }|st        |d         dk(  rC||rdnd   }|g}	|j                  j                         D 
cg c],  }
t        |
t              rt        |
j                        dkD  r|
. }}
|D ]  }
g |
j                  j!                         }t        j"                  j$                  }t&        j(                  j*                  }d}t-        |      D ]#  \  }\  }}||z  }|}|j/                  ||      s# n |j1                  ||      s|dz   }|r|d| n||d }g }|D ]  \  }}t3        j4                  |
j6                  |      }t9        d|j;                  t<              |j;                  t>              z   t        |            }t3        j@                  ||||      }||d   n|g}|jC                  |        |D cg c]F  }t&        j(                  j*                  j1                  |t        j"                  j$                        s|H }}t        |      dkD  s|	jE                  |        |	D ]z  }t9        dt        |      tG        d      z
        }|dz   }tI        |d|       }|ftK        ||d       z   } |jM                  | jO                  | jQ                  | |      ||             |  tS        |t        d      }!|!S c c}
w c c}w )z
        Creates N-dimensional tiling candidates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r=   r   Nr   T)r   reverse)*r   r   ry   r   r  rF   r-  r   r   r/  r  r   r  reads_and_writesr!   r  rv   r   r   r<   r   r   r[  statically_known_geqr   r>   get_subexpr_involving_symbolr   rj  r   r   r   match_mod_div_block_exprrk  r   r^   r7   r(  r   r  r  r	  )"r  r.  pointwise_numelr  r  r  r   node_rangesranges_to_tilenode_tilingsr=  memory_depsall_var_rangespointwise_vars_numelr   pointwise_end_idxr]  _varrj   reduction_start_idxri   index_tilingvarr   num_dimsmatch_resultdimsdimnode_tilingnum_leading_dimsfirst_trailing_dimcollapsed_leading_dimcollapsed_splitsranked_tilingss"                                     r]   get_nd_tilingszSIMDScheduling.get_nd_tilings	  s    '!+^CO<=?#**=9 a	DdI$;$;< //+KCA$71$< )lBN*+L  ++<<>c9-#cjj/A2E K 
 # ;6 "73::#3#3#5!6',ww{{$77++$%!*3N*C &C$(E1((+%44,o   77(/  '8!&;# $ ##7$78'(;(<=   "", .JC/LL		3E
  #H-O0LLN+ H $7#O#OsE8$L /;.F<?UGD ''-%..  , 77++CCCU     |$q( ''5w;6|  , #&q#k*:]1=M*M#N %5%9"(5kBUCU6V(W%$9#;e 2 34? $  //112BLQ''ia	J  
 ur s   .1L70AL<c                    j                   sdnj                   j                  j                  j                  j                  j                  j                  j
                  }D cg c]  }||   	 c}D cg c]  }||   	 c}t        j                  t              k(  fd       t        j                  t              k(  fd       i g }	 	 	 d	 	 	 	 	 	 	 df	d}|j                   |d       |d      f       r$|j                   |fdd       |d      f       j                  j                         z  }	|	D ]%  }|j                   ||fd       |d      f       ' t        d	
      d	k(  rBdk(  r=t        j                  |	d      D ]$  }
|j                   ||
d       |d      f       & g }|D ]b  \  \  }}\  }}t        | j!                  ||      t#        |      t#        |      z         }| j!                  ||      }|j                  ||f       d | j!                  gg      }ddt#        j$                  j'                               fd}t)        ||      D ]  \  }}| j+                  |j,                        s|j,                  |k(  rt/        |j,                        dk(  rdndz
  }|t        d	
      kD  rDt0        j3                  d|t        j4                  j6                  j8                  j:                         |j,                  |fc S |j,                  |k(  s|j,                  |fc S  |dfS c c}w c c}w )zr
        Generates a tiling, and a score of each tile according to each tile's coalesced memory accesses.
        Nc                      d d  S Nr   r   )r.  r  	pw_rangess   r]   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    ykO#4B}oF r_   c                      d d  S r(  r   )r.  
red_rangesr  s   r]   r   z8SIMDScheduling.compute_tiling_strategy.<locals>.<lambda>
  s    zl"_$5RG r_   Fc                @  	 |rn}|rn}|s|r|gg fS g g fS t        |       ||f}j                  |      x}r|S |rn}g }g }	d}
d}t        ||      D ]  \  }}|| vr"|
|z  }
j                  j                  |d      }-|r|k(  rj                  }|J |j
                  }t        ||j
                        }|j                  |
|z         |	j                  |j                         |j                  |       |	j                  j                  j                  |d             d}
d}|
|z  }
|j                  |
       |	j                  j                  j                  |d             d}
 |
dk7  s|r0t        |      dk(  r"|j                  |
       |	j                  |       t        t        |            D ]S  }t        j                  j                  j                  ||   d      }t        |d      }t!        |	|   |z  dz        |	|<   U ||	f|<   ||	fS )z]
            Generate a tiling, and a tiling score, given vars to use as splits.
            r=   r   r  r   r  )r   r   r  coalesced_by_varsuggested_splittiling_factorr   r   r  r   rt  r<   r   r   r   r  r   )vars_to_useuse_split_varr  r  target_numelr   r  splitting_varsr  split_scoresprodprev_var_coalesced_scorer!  v_range
var_tilingtile	remainderr;  r   all_iter_varsall_red_varsr7  r  r)  r+  r  scored_sub_split
tiling_vars                      r]   process_node_varszASIMDScheduling.compute_tiling_strategy.<locals>.process_node_vars
  sX    #/YJF.:?L)NB//8O$m\BC&**3//s/
.:]NFLD'($ ".&9 
7K'GOD/@/Q/Q/U/U10,  Q*_!2!B!BJ%111%33D (*2J2J KIMM$"23 ''
(8(89MM$' ''(9(J(J(N(NqRS(TUD/0,d###$5$F$F$J$J1a$PQ;> qy\c&kQ.>d###$<= 3v;' ?GG$$..vay2.F1I"%l1o&9A&=">Q?
 &,\$:S!L))r_   T)r  )r1  r  r   r\   r=   r   )r  gffffff?gGz?c                    d}| d   j                   j                         D ]"  }t        j                  |      s|z  }|z  }$ dz  }| d   j                  |z    |z  S )Ng      ?r   g?)r%  r   r  r  r  )r  score_factor	tile_sizeuncoalesced_penalty"bad_size_additional_tiling_penaltygood_size_tiling_penaltytotal_uncoalesceds       r]   	score_modz9SIMDScheduling.compute_tiling_strategy.<locals>.score_mod  sw    LqT[[//1 K	&33I>#/2T#TL#/2J#JL	K #4d":qTZZ"556EEr_   r   r   zmFound optimal tiling with %s tiles but torch._inductor.config.triton.max_tiles set to %s. Consider increasing)r   FF)r0  ztuple[sympy.Expr, ...]r1  r   r  r   r   ztuple[list[int], list[int]])r.  r  norm_read_writesr   reduce_varsri   rX   _checkr7   r   r-  rh  r^   r  combinationsr  r  rE  uncoalesced_addrsr   r	  tiling_is_compatibler%  r   perf_hint_loginforY   r   rZ   r[   )r  r.  r  r  r7  r  r!  score_splitr?  overlapping_iter_varsr0  r  pw_splitpw_score	red_split	red_score	candidaterY  default_tilingrH  cand
tiling_lenr;  r<  rE  rF  r)  r+  r=  r>  rG  s    ````                 @@@@@@@@@r]   compute_tiling_strategyz&SIMDScheduling.compute_tiling_strategyf
  s    %44 "2266 	 *::EE(99EE"33>>(561VAY6	)56AfQi6
)$7F	

 	*%8G	
 DF  	
 35"'!&K	*/K	*K	* K	* )	K	* K	*\ 	!t4!u5	
 %#T &59	 ->>CCEE 	 ' 	A%qd>%59	 #q(_-A(556KQO "")+DI)u= RT<G 	68 Xx"89i'!!(I6(mc)n4I ,,XyALNNI|45	6 **O+<>OP .3*#(  1 C C J J LM	F #)i"@ 	1D,((!?OT[[ ;;.0 !-o6JPQR
a 88!&&9"..55??	 {{L00 {{n,{{L00/	12 t##G 76s   8M6M;c                T    t        t              sJ t        fd|D              S )Nc              3     K   | ]R  }t        |t        j                        r6t        j	                  j                         |j                                 T ywr  )r   r   r/  r}   r  r   r  )r   r   r  r%  s     r]   r   z6SIMDScheduling.tiling_is_compatible.<locals>.<genexpr>I  sO      
 $	 7 78	 $$!2O % 
s   AA)r   rx  r   )r  r.  rj   r  r%  s      ``r]   rN  z#SIMDScheduling.tiling_is_compatible@  s1     &$''' 
 &	
 
 	
r_   c                B    |D ]  }| j                  ||||      s|c S  y rW   )rN  )r  r.  rj   r  r$  r%  s         r]   get_first_compatible_tilingz*SIMDScheduling.get_first_compatible_tilingQ  s1     % 	F''uovV	 r_   c                0    | j                  ||||      d   S r  )rM  )r  r.  rj   r  r7  s        r]   r  zSIMDScheduling.select_tiling_  s)     ((5/3D

 	r_   c                   |dk(  }| j                  |g|g      }t        j                  |      D ]  }t        |j                  t
        j                        s(|j                  j                         dk(  sFt        j                  j                  sa|j                         }|d   }	|d   }
| j                  |	|
      }|dfc S  t        j                  j                  j                  j                  r0|r.t        j                  j                  s| j!                  ||||      S |st        j                  j"                  rt%        d      dk  rt&        j(                  t*        j,                  k  rt        j                  |      D ]i  }t        j                  j"                  rt/        | j1                  |||            dkD  s>t&        j3                  t5        j6                  d              |dfS  |dfS t9               }t;        j<                         }t        j                  |      D ]g  }| j1                  |||      D ]O  }|j>                  |v r|j>                  |jA                  |j>                         ||xx   |jB                  z  cc<   Q i |jE                         D cg c]  \  }}|jF                   }}}t%        d      dk\  r?|r=	 	 	 	 	 	 dd	}tI        dt/        |            D ]  } ||d   ||         }||g|z   } n t/        |      dkD  rt&        j3                  d
|       t        j                  j                  r| jK                  |||      |z   }| jM                  ||||      x}r|dfS |dfS c c}}w )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r=   r  r   Nr   r@  z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                r   c                8   | d   | j                  dd      }}|d   |j                  dd      }}t        ||g      s/t        j                  j                  j                  ||z
        dk(  ry t        j                  j                  j                  ||z
        dk  r||f||fc\  }}\  }}t        j                  j                  j                  ||z
        dkD  sJ t        j                  j                  j                  ||      sy |t        ||      || d   d}|S )NrS   rR   r=   r   rT   )rQ   rR   rS   rT   )r   r   r<   r   r   r   r  r   )tiling0r  a0a1b0b1
new_tilings          r]   convert_tiling_to_3dzBSIMDScheduling.get_tiling_and_scores.<locals>.convert_tiling_to_3d  s    !w{{3':B w{{3':B *2r(3ww''11"r':a?77##--b2g6:*,bB8&HRhr2ww''11"r':Q>>>ww''DDRL !"b)"5>	
 "!r_   zpossibly bad tiling: %s)rc  rt  r  rt  r   rv  )'r  rF   r-  r   r   r   r0  r1  r   rZ   r-  r  rX   rY   r9  prefer_nd_tilingr[  tile_reductionsr^   rO  levelloggingWARNINGr   r  rP  textwrapdedentr   collectionsr   rg   r   r  most_commonr%  rt  r%  r_  )r  r.  rj   r  r7  r  rX  r   r  	range_y_xrange_rr%  
seen_namescandidate_tilescandidate_tilingr  r$  ri  r;  new_3d_tilings                       r]   rM  z$SIMDScheduling.get_tiling_and_scoresk  s   " '!+ **E7_4EF $**=9 	(D$))R%6%67II002e;33 #'//"3K +AI)!nG ..y'BF!4<'	(  OO""))BB!MM22..uo7H  V]]%B%B}H
H ""goo5+22=A D"MM99 5 5dE? STWXX%**$OO!$ !4'' "4''&0l
4?4G4G4I#**=9 	LD$'$9$9$$W L #((J6%**6NN#3#8#89 015E5K5KK1L	L ,;+F+F+H7
' % ##7
 7

 #q(\"."9N"0"8 1c.12  4"1%~a'8! !,&3_~%EN ~"8.I ==))""=%I ! 
 445/>
 
6 
 4<t##A7
s   "M;c                     y rW   r   rq   s    r]   flushzSIMDScheduling.flush  rr  r_   c                     yrS  r   rq   s    r]   ready_to_flushzSIMDScheduling.ready_to_flush   rT  r_   c                   t        d |D              st        |d       j                  \  }\  }}| j                  |||      }| j	                  |||      }| j                  |t        |||            }	| j                  ||	       t        j                  d|      5  t        j                  |	      5  |	j                         }
d d d        d d d        nM|d   j                  |      \  }}}t        j                  d|      5  | j                  |||d|      }
d d d        
j                  t!        t"        j$                        d	      }
|
S # 1 sw Y   xY w# 1 sw Y   @xY w# 1 sw Y   LxY w)
Nc              3  <   K   | ]  }|j                           y wrW   )r  )r   r   s     r]   r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>  s     2q1==?2rp  c                4    t        | j                               S rW   r2  r3  s    r]   r   z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>  r  r_   r   )r  r  r   Tr  r  )r  rj  r  r  r  r  rI   r  r   r  r<   r  rp  get_prologue_template_epiloguer  r  ry   r4   r  )rn   r   r  r  r  rj   rc  r.  r%  rl   r  r  templateepilogues                 r]   generate_kernel_code_from_nodesz.SIMDScheduling.generate_kernel_code_from_nodes  sx    2E22!$U0O!P!V!VAv 77ufMM''ufEF%%+M5&I & F 22=&I/1AB3$$V,3 "002	3 3 3 ,18+R+R,(Hh 02BC 00&*"/ 1  ##C(?(?$@)L)3 3 3 3 s0   E3EEE%E	EE"%E.c                    t         rW   rH  )rn   r  r.  rl   s       r]   r  zSIMDScheduling.define_kernel%  rL  r_   )r  N)r  zOptional[OrderedSet[str]]r   ztuple[float, str]rW   )r   z!Sequence[scheduler.SchedulerNode]r7  Optional[CoalesceVarAnalysis])r   z<Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode])rj   r|   rE  zGIterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject, ir.IRNode]]r   r   )r  rI   )r  rI   r   zlist[SIMDKernel])r   r   r   tuple[tuple[int, ...], ...])r   r   r   r   )r   r   r  r   r   r  )r  r   r   Optional[str])F)r  zlist[BaseSchedulerNode]r  r   r  r   r  r   rk  r   r   zlist[tuple[str, Any, Any]])r   r  )r  ry  r  ry  r   immutable_dict[str, sympy.Expr])r%  ry  r  r   r   r  )r%  rt  rj   r|   r  r|   r   r  )r   z%list[immutable_dict[str, sympy.Expr]])
r.  list[NodeScheduleEntry]r  r|   r  r|   r7  rM   r   =tuple[dict[str, sympy.Expr], Optional[dict[str, sympy.Expr]]])r.  r  rj   r|   r  r|   r%  rt  )r.  r  rj   r|   r  r|   r$  zlist[dict[str, sympy.Expr]])r7  r  r   rt  )r7  r  r   r  r   )FN)r  r   )4r   r   r   r   r}   r  r	  r  r  can_fuse_verticalr  r  r  r  r  r  r  r  r;  r  rI  r6  r  r  r  r  r  r  r  r  r  r  r  r   r   r  r  r  r  r%  r[  rN  r_  r   r   r   r  rM  rz  r|  r  r  r   r_   r]   r  r    s   
 (K'Q`6D !"^@8
%2)j RV"5N"	"
Y)| <@
0
 9
(=P=2 $$
$
 
$ $LQ)f
1
	
 -T  B'	$,

'
/2
	$
.  '+m %m 
m^I #(< 0<   $<  	< 
 <   <  
$< |)( Y}  }~ 

,

@T

	(

 

 
$
 
 
)	
 
 /%/ / $	/
 
)/ /( y
 
/y yv W$.W$ $W$ $	W$
 /W$ 
GW$ W$r 
.
 
 $	

 &
 
  .  $	
 4  
 ;?	
 9	 
	 	 
 ;?O$
 9O$ 
GO$ O$b MQ <I D"r_   r  T)frozenc                  @    e Zd ZU ded<   ded<   dZded<   ed        Zy)	r  rt  r%  r   r  Nr  rg   c                r    t         j                  j                  j                  |       } | dk\  xr | dz  dk(  S )z@Somewhat arbitrary heuristic used to boost scores for some sizesr  r   r  )r   s    r]   r  zCandidateTiling.is_good_size/  s5     GG&&q)Bw(AFaK(r_   )r   r   r   r	  rg   r  r  r   r_   r]   r  r  )  s)    !!JD-) )r_   r  c                      e Zd Zy)r  N)r   r   r   r   r_   r]   r  r  6  s    r_   r  )r   )r\   r   r   r   )r  r}  r   ry   )
__future__r   rq  r  dataclassesr   r  rm  r   r  ro  r   typingr   r   r   r   r	   typing_extensionsr
   r   rX   torch._loggingtorch._inductorr   torch._inductor.irr   torch._inductor.tiling_utilsr   %torch.fx.experimental.symbolic_shapesr   torch.fx.immutable_collectionsr   torch.utils._ordered_setr   torch.utils._sympy.functionsr   r   r   torch.utils._sympy.symbolr   r   r   r   _dynamo.utilsr    r   r   r   analyze_preserves_zero_maskr   	codecacher   r    dependenciesr!   r"   r#   collections.abcr$   r&   optimize_indexingr'    runtime.coordinate_descent_tunerr(   runtime.hintsr)   runtime.runtime_utilsr*   r+   r,   r-   r.   r/   utilsr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   virtualizedr:   r;   r<   block_analysisr>   commonr?   r@   rA   rB   r  rC   rD   simd_kernel_featuresrE   rF   rG   rH   rI   rJ   rK   rL   rM   	getLoggerr   rJ  _logginggetArtifactLoggerrO  r4  
fusion_logdoprintr~  r_  r^   	dataclassra   r~   r   r  r  r  r}   r  r  	Exceptionr  r   r_   r]   <module>r     s   "           ? ? %    # 2 B G 9 / L L  & $ $ F . 6 6 ( A < , L L D D   - , / P P :  <<@ g!00<H~~//*E^^--hA
 	78;
 3+ 3+ 3+lO;/ O;d;'? ;'| +;T   x('/*B xvm"^ m"`9 d#	) 	) $	)		 	r_   