
    qi{b                   	   d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlmc mZ d dlmZ  d dl!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z) d dl*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0m1Z1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z< ddl=m>Z>m?Z?m@Z@ ddlAmBZB ddl@mCZCmDZD ddlEmFZF ddlGmHZH ddlmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\m]Z] ddl^m_Z_ ddl`maZambZbmcZc er$d dlmdZdmeZe d dlfZfdd lgmhZh dd!l@miZi dd"ljmkZk dd#llmmZm  ej                  eo      Zp e[       j                  Zresej                  ej>                  euevf   Zwee@j                  e\f   Zyed$gdf   Zzdjd%Z{dkd&Z|d' Z}e~euef   Zeeseeej0                  f   d(f   eegesed(f   f   f   Z	 	 dl	 	 	 	 	 	 	 	 	 	 	 dmd)Zdnd*Zej                   G d+ d,             Z G d- d.      Z G d/ d$      Zej                   G d0 d1e             Zej                   G d2 d3e             Zej                   G d4 d5e             Zej                   G d6 d7e             Zej                   G d8 d9e             Zej                   G d: d;e             Z G d< d=e      Zej                   G d> d?e             Zej                   G d@ dAe             Zej                   G dB dCe             Zej                   G dD dEe             Zej                   G dF dGe             Zej                   G dH dIe             Z G dJ dK      Zej                   G dL dMe             Zej                   G dN dOe             Zej                   G dP dQe             Zej                   G dR dSe             Z G dT dUe      Zej                   G dV dWe             Zej                   G dX dYe             Zej                   G dZ d[e             Zej                   G d\ d]e             Zej                   G d^ d_e             Zej                   G d` dae             Zej                   G db dce             Zej                   G dd dee             ZeuZeeePf   Z G df dgeY      Z G dh die      Zy)o    )annotationsN)Callable)chaincount)AnyOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)is_opaque_value_type)trace_structured)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfDelayReplaceLineget_benchmark_nameget_dtype_sizeIndentedBuffer#is_codegen_graph_partition_subgraphis_using_cudagraph_partitionLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLinePythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLowering)ExternKernel)BaseSchedulerNode)FxConverterWrapperLinec                8   t         j                  j                  |       }| j                         t         j                  j                  v}| j                         | j                         t        t         j                  j                  j                  |            |fS N)
r3   graphget_allocation_storage_sizeget_nameunaligned_buffersget_device_or_error	get_dtyper0   sizevarssimplify)nodestorage_size	alignments      e/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyrT   a   sr    7766t<Lqww'@'@@I  " 	!''""++L9:     c                   | j                         |j                         k7  ry| j                         |j                         k7  ryt        j                  j                  j                  t        j                  j                  |             }t        j                  j                  j                  t        j                  j                  |            }t        |      t        |      k(  sWt        j                  j                  j                  |d|z        r+t        j                  j                  j                  ||      ryy)NFgffffff?T)
rL   rM   r3   rH   rN   rO   rI   r0   statically_known_geqstatically_known_leq)	input_buf
output_buf
input_sizeoutput_sizes       rS   can_match_buffer_sizer]   o   s     $$&**H*H*JJ
 4 4 66!!**	++I6J ''""++	++J7K 	*;!77 	
--k4*;LMGG11+zJrU   c                   t        | t        j                        r?| j                         }|j                  |j
                  |j                  |j                  dfS g }| }t        |t        j                  t        j                  t        j                  f      rj|j                         }|y|j                  |       |j                  }t        |t        j                  t        j                  t        j                  f      rjt        |t        j                        sy|D ]+  }|j                  |j                         j                  k7  s+ y |j                         }|j                  |j
                  |j                  |j                  dfS )a  
    Collapse a chain of ReinterpretView <- StorageBox
    <- ReinterpretView <- StorageBox.... <- buffer wrappers if every layer
    has the same offset as the innermost (base) buffer.

    Returns:
        (size, stride, offset, dtype, collapsible: bool)
    T)NNNNF)
isinstancer!   Buffer
get_layoutsizestrideoffsetr   	TensorBox
StorageBoxr$   appenddata)rh   laylayoutscurbase_lays        rS   codegen_reinterpret_view_helperrm      s!    $		"ooxxSZZD@@G
C
S2<<8J8JK
Lnn;0shh S2<<8J8JK
L c299%,  1::)00001 ~~H==(//8??HNNDPPrU   .c                    t               dd	 d	 	 	 dfd}dd fd}d  } |d| d       r4t        j                  j                  rj                  j                         nt        j                         }j                         5  |5  t        j                  j                  rV|rTt        j                  j                  r:|t        j                  j                  v rt        j                  j                  |   }	nd gt        |      z  }	t        |      dk(  r" ||d   |	d         \  }
} |d	|
 d	|        nt        |      dkD  sJ t        |      t        |      k(  sJ t               }t        t        |||	      d
 d      D ]  \  }
}}g }|j                  r:|j                  D ]+  }|dvs|j!                  d| d|j                  |           - |rdj#                  |      }nd} ||
|      \  }
}d| d|
 }||v r~|j%                  |        ||d| d|         d d d        d d d        |j'                         fS # 1 sw Y   #xY w# 1 sw Y   'xY w)Nc                d    t        | t        j                        r| S t        j                  |       S rG   )r_   sympyr   Integer)items    rS   _convert_to_sympy_exprz@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_expr   s#    !$

3tLt9LLrU   c                    t        |       r| | fS t        fd| D              }|s|}j                  |      t        j                  j
                  r$j                  t        fd|D                    fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc              3  .   K   | ]  } |        y wrG    ).0grs   s     rS   	<genexpr>zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s     C1!4Cs   c              3  T   K   | ]  }j                  |t        |             ! y wrG   generate_example_arg_valuetype)rw   rx   wrappers     rS   ry   zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>   s*        ::1d1gF   %()callabletuplecodegen_python_shape_tupler    tritonautotune_at_compile_time)gridexample_grid
sympy_gridrs   r~   s      rS   determine_gridz8user_defined_kernel_grid_fn_code.<locals>.determine_grid   s     ?htn:CdCC
%L..z: ==99 22 !- 
 	
 
 	
rU   c                    j                  |        rJt        j                  j                  r/j                  vr j
                  j                  |xs |        y y y y rG   )	writeliner    r   r   kernel_autotune_nameskernel_autotune_calls)liner   nameoutputr~   s     rS   r   z3user_defined_kernel_grid_fn_code.<locals>.writeline   sW    66G999))33L4HDI : 7 rU   grid_wrapper_for_def z(meta):r4   r   zreturn c                2    t        | d   j                        S Nr4   lenkwargsxs    rS   <lambda>z2user_defined_kernel_grid_fn_code.<locals>.<lambda>
  s    c!A$++. rU   Tkeyreverse)matrix_instr_nonkdimwaves_per_eukpackzmeta['z'] == z and Trueif z	: return )rr   Union[int, sympy.Expr]return
sympy.ExprrG   )r   
TritonGridr   zOptional[TritonGrid])r   strr   Optional[str])r+   r    r   r   r   indent
contextlibnullcontextr3   rH   autotuning_gridsr   r   sortedzipr   rg   joinaddgetvalue)r   configsgridsr~   original_fxnode_namer   r   fn_namekernel_autotune_calls_indentexample_gridsr   r   seenc
guardslistkwargguards	statementrs   r   s   `  `              @@rS    user_defined_kernel_grid_fn_coder      s    FM
 .2

*
>J J "$(GWIW%& v}}== 	%%,,.##% !
 
 .L6 .LMM22$(($(@(@@GG445IJM!FSZ/Mu:?!/a-:J!KD,v&',(@Au:>!>u:W---$.LD *0E7M2.* L%a
  
88!" W  ) 
 '--ugVAHHUOCT.UVW $\\*5F#F%3D,%G"l!&4&9	$#)s6()L>%JK1L-.L .L` FOO%%%a.L .L .L .Ls,   I!D%I<A7I3I!I	I!!I*c                    t               j                  | j                  d       ddlddlm ddlm t        | j                  g      fd |        j                         S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   N)JITFunction)	constexprc           	        t        d t        j                  | j                        D              }| j                  j                  j                  di       }| j                  j                  j                  D ]w  }|v r	|| j                  j                  v s"| j                  j                  |   }t        |      rX	j                          	j                  d       	j                  |j                  d       j                  |        |       t        d      rt        |j                  j                   j"                        rY	j                          	j                  d       	j                  |j                  d       j                  |        |       .t        |t$        t&        t(        
f      r	j                          t        |
      rd|j*                  d	}n|}|j                  |      x}rKt        |t,              rd
|j.                   d|j0                   }nd
|}	j                  | | d|        n	j                  | d|        j                  |       ||v s|dk7  st        |d      s|j.                  j3                  d      s9	j                  d|j.                   d|j0                   d|        j                  |       z y )Nc              3  R   K   | ]  }|j                   d k(  r|j                   ! yw)LOAD_GLOBALN)opnameargval)rw   insts     rS   ry   z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>:  s(      '
{{m+ KK'
s   %'__annotations__z@triton.jitTr   constexpr_functionz@triton.constexpr_functionztl.constexpr(): . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__get__code__co_namesr_   newliner   splicesrcr   hasattrruntimejitConstexprFunctionintr   boolvaluer}   r   __name__
startswith)
cur_kernelunqualified_loadsglobal_annotationssymbol_namesymbol
symbol_str
annotationannotation_coder   compile_wrapperr   symbols_includedtraverser   s           rS   r   zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse5  s   
 ' '
Z]]3'
 

 (]]66::;LbQ%==11:: 9	6K..jmm777#22;?fk2#++-#--m<#**6::T*B$((5V$V%9:zNN&&88	@ $++-#--.JK#**6::T*B$((5V$c4(CD#++-!&)4'4V\\4DA%F
(.z
%7%;%;K%HHzH%j$7"$Z%:%:$;1Z=P=P<Q R , 13:..AO'11*mO+<C
|L (11[MZL2QR$((5#44#t+5 ))44X>
 $-- 1 12(6??:K4P[}] %((5s9	6rU   )
r+   r   r   r   r   triton.languager   r   r   r   )kernelr   r   r   r   r   r   s    @@@@@@rS   9user_defined_triton_kernel_transitive_closure_source_coder   %  si    
 %&O6::T2 ") "6??"34D6 D6L V##%%rU   c                  (    e Zd ZU ded<   ded<   d Zy)SymbolicCallArgsympy.Symbolinnerr   
inner_exprc                ,    t        | j                        S rG   )r   r   selfs    rS   __str__zSymbolicCallArg.__str__  s    4::rU   N)r   r   __qualname__r   r   rv   rU   rS   r   r     s    rU   r   c                  6     e Zd Z fdZddZddZddZ xZS )MemoryPlanningStatec                l    t         |           t        j                  t              | _        d| _        y Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_size)r   	__class__s    rS   r  zMemoryPlanningState.__init__  s-    ##D) 	 12(rU   c                L    t        | j                  j                  |d             S rG   )r   r  r   )r   r   s     rS   __contains__z MemoryPlanningState.__contains__  s    DOO''T233rU   c                \    | j                   |   j                         }|j                  rJ |S rG   )r  pop	is_reusedr   r   rr   s      rS   r  zMemoryPlanningState.pop  s+    s#'')>>!!rU   c                \    |j                   rJ | j                  |   j                  |       y rG   )r  r  rg   r  s      rS   pushzMemoryPlanningState.push  s&    >>!!##D)rU   )r   ReuseKeyr   r   )r   r  r   FreeIfNotReusedLine)r   r  rr   r  r   None)r   r   r   r  r	  r  r  __classcell__r  s   @rS   r   r     s    24
*rU   r   c                      e Zd ZddZy)rE   c                0    t        dt        |              )Nz&FX codegen not yet supported for type )NotImplementedErrorr}   r   	converters     rS   
codegen_fxzWrapperLine.codegen_fx  s    !$J4PT:,"WXXrU   Nr  rD   r   FxConversionFuncr   r   r   r  rv   rU   rS   rE   rE     s    YrU   c                  :    e Zd ZU ded<   ded<   d	dZd
dZddZy)EnterSubgraphLinePythonWrapperCodegenr~   rA   rH   c                b    | j                   j                  | j                   j                         y rG   )r~   push_computed_sizescomputed_sizesr   s    rS   __post_init__zEnterSubgraphLine.__post_init__  s    (()D)DErU   c                n    | j                   j                  | j                         |j                          y rG   )r~   push_codegened_graphrH   	do_indentr   codes     rS   codegenzEnterSubgraphLine.codegen  s"    ))$**5rU   c                    |j                   S rG   )_generate_enter_subgraphr  s     rS   r  zEnterSubgraphLine.codegen_fx  s    111rU   Nr   r  r)  r+   r   r  r  r   r   r   r   r$  r*  r  rv   rU   rS   r  r    s    !!F2rU   r  c                  <    e Zd ZU ded<   ded<   ddZed	d       Zy)
ConditionalLiner   r~   zir.ConditionalrP   c                    t        d      )NzOnly supports FX codegen)r  r(  s     rS   r*  zConditionalLine.codegen  s    !"<==rU   c                    | j                   S rG   )_generate_conditionalr  s    rS   r  zConditionalLine.codegen_fx  s    ...rU   Nr.  r  r   r   r   r   r*  staticmethodr  rv   rU   rS   r1  r1    s'    !!
> / /rU   r1  c                  2    e Zd ZU ded<   ddZedd       Zy)CommentLiner.   r   c                :    |j                  | j                         y rG   )r   r   r(  s     rS   r*  zCommentLine.codegen  s    tyy!rU   c                    | j                   S rG   )_generate_commentr5  s    rS   r  zCommentLine.codegen_fx  s    ***rU   Nr.  r  r6  rv   rU   rS   r9  r9    s!    
" + +rU   r9  c                  <    e Zd ZU ded<   ded<   ddZed	d       Zy)
DynamicScalarLiner   r~   zir.DynamicScalarrP   c                N    | j                   j                  | j                         y rG   )r~   _codegen_dynamic_scalarrP   r(  s     rS   r*  zDynamicScalarLine.codegen  s    ,,TYY7rU   c                    | j                   S rG   )_generate_dynamic_scalarr5  s    rS   r  zDynamicScalarLine.codegen_fx  s    111rU   Nr.  r  r6  rv   rU   rS   r>  r>    s'    !!
8 2 2rU   r>  c                  0    e Zd ZU ded<   ddZddZd	dZy)
ExitSubgraphLiner   r~   c                V    | j                   j                         | j                   _        y rG   )r~   pop_computed_sizesr#  r   s    rS   r$  zExitSubgraphLine.__post_init__  s    &*ll&E&E&G#rU   c                X    | j                   j                          |j                          y rG   )r~   pop_codegened_graphdo_unindentr(  s     rS   r*  zExitSubgraphLine.codegen  s    ((*rU   c                    |j                   S rG   )_generate_exit_subgraphr  s     rS   r  zExitSubgraphLine.codegen_fx  s    000rU   Nr-  r.  r  r/  rv   rU   rS   rD  rD    s    !!H1rU   rD  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
EnterDeviceContextManagerLiner   
device_idxOptional[int]last_seen_device_guard_indexc                x   t         j                  j                  r|j                  d       t         j                  j                  rg| j
                  ;|j                  t         j                  j                  j                          d       y | j
                  | j                  k(  s{J d       | j
                  H|j                  t         j                  j                  j                          d| j                   d       y |j                  d| j                   d       y y |j                  dt         j                  j                  j                  | j                         d       |j                          |j                  t         j                  j                  j                  | j                               y )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r3   rH   cpp_wrapperr   aot_moderP  
device_opscpp_aoti_stream_guardrN  cpp_aoti_device_guarddevice_guardr'  
set_devicer(  s     rS   r*  z%EnterDeviceContextManagerLine.codegen  sP   77NN4 ww 44<NN77--CCEFFop  <<O NO 44<NN77--CCEFnUYUdUdTeegh NN%<T__<MR#PQ P NNU177#5#5#B#B4??#S"TTUVWNNNN177--88IJrU   c                    |j                   S rG   )&_generate_enter_device_context_managerr  s     rS   r  z(EnterDeviceContextManagerLine.codegen_fx
  s    ???rU   Nr.  r  r   r   r   r   r*  r  rv   rU   rS   rM  rM    s    O"//K:@rU   rM  c                      e Zd ZddZddZy)ExitDeviceContextManagerLinec                Z    t         j                  j                  s|j                          y y rG   )r3   rH   rU  rI  r(  s     rS   r*  z$ExitDeviceContextManagerLine.codegen  s     ww"" #rU   c                    |j                   S rG   )%_generate_exit_device_context_managerr  s     rS   r  z'ExitDeviceContextManagerLine.codegen_fx  s    >>>rU   Nr.  r  r   r   r   r*  r  rv   rU   rS   r`  r`    s    ?rU   r`  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelAllocLiner   r~   ir.ExternKernelAllocrP   c                    | j                   }g |j                         |j                         }| j                  j	                  | j                   |       y rG   )rP   codegen_argscodegen_kwargsr~   $_generate_extern_kernel_alloc_helper)r   r)  rP   argss       rS   r*  zExternKernelAllocLine.codegen  sD    yy=""$=t':':'<=99$))TJrU   c                    |j                   S rG   )_generate_extern_kernel_allocr  s     rS   r  z ExternKernelAllocLine.codegen_fx!  s    666rU   Nr.  r  r^  rv   rU   rS   rf  rf    s    !!
K
7rU   rf  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ExternKernelOutLiner   r~   ir.ExternKernelOutrP   c           	     D   | j                   }g |j                         |j                  d      }|j                         }t        j
                  j                  r|j                  dk(  rd}n|j                         }|j                         x}r|j                  nt        j
                  j                  }| j                  j                  ||j                         |j                  r|j                  j                         nd ||| j                   j                                y )NT)skip_outztorch::inductor::_mm_plus_mmaoti_torch__mm_plus_mm_out)rP   ri  rj  get_kernel_namer3   rH   rU  cpp_kernel_name
get_devicer}   device_typer~   "_generate_extern_kernel_out_helpercodegen_referenceoutput_viewget_stack_traces)r   r)  rP   rl  kernel_nameddevices          rS   r*  zExternKernelOutLine.codegen*  s    yyJ""$Jt':':D':'IJ**,GG$$(FF 7K..0K!%!22A29L9L77""$484D4DD..0$II&&(	
rU   c                    |j                   S rG   )_generate_extern_kernel_outr  s     rS   r  zExternKernelOutLine.codegen_fx@      444rU   Nr.  r  r^  rv   rU   rS   rp  rp  %  s    !!

,5rU   rp  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
FreeLiner   r~   %Union[BufferLike, ir.TorchBindObject]rP   c                    | j                   j                         t        j                  j                  vsJ |j                  | j                  j                  | j                                y rG   )rP   rJ   r3   rH   removed_buffersr   r~   make_buffer_freer(  s     rS   r*  zFreeLine.codegenI  sF    yy!!#177+B+BBBBt||44TYY?@rU   c                    |j                   S rG   )_generate_freer  s     rS   r  zFreeLine.codegen_fxM      '''rU   Nr.  r  r^  rv   rU   rS   r  r  D  s    !!
//A(rU   r  c                      e Zd ZU ded<   ded<   ded<   ded<   ded<   d	ed
<   ded<   ded<   ded<   ded<   ded<   ddZddZy)KernelCallLiner   r~   r   r}  ztuple[Any, ...]	call_argsraw_keysraw_args	list[str]	arg_typesr   r   zdict[str, Any]triton_metaztorch.devicer  
graph_namer   c                   | j                   j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  
       y )N)r   r  r  r  r  r  r  r   )r~   _generate_kernel_call_helperr}  r  r   r  r  r  r  r  r  r   r(  s     rS   r*  zKernelCallLine.codegen_  se    11NN;;nn]]]]((;;!%!:!: 	2 	
rU   c                    |j                   S rG   )_generate_kernel_callr  s     rS   r  zKernelCallLine.codegen_fxm      ...rU   Nr.  r  r^  rv   rU   rS   r  r  Q  sL    !!LO
/rU   r  c                  f    e Zd ZU ded<   ded<   ded<   dZded<   d	Zd
ed<   dZded<   ddZddZy)KernelDefinitionLiner   r~   r   r}  kernel_bodyNr   metadataTr   gpucpp_definitionc                    | j                   j                  | j                  | j                  | j                  | j
                  | j                         y N)r  r  r  )r~   _define_kernel_helperr}  r  r  r  r  r(  s     rS   r*  zKernelDefinitionLine.codegenz  sB    **]].. 	+ 	
rU   c                    |j                   S rG   )_generate_kernel_definitionr  s     rS   r  zKernelDefinitionLine.codegen_fx  r  rU   r.  r  )	r   r   r   r   r  r  r  r*  r  rv   rU   rS   r  r  q  s<    !!"Hm"C$(NM(
5rU   r  c                  0    e Zd ZU ded<   ddZddZd	dZy)
MemoryPlanningLiner   r~   c                    | S )zFirst pass to find reuserv   r   states     rS   planzMemoryPlanningLine.plan  s    rU   c                     y)zSecond pass to output codeNrv   r(  s     rS   r*  zMemoryPlanningLine.codegen  s    rU   c                r   g }t        j                  |       D ]t  }|j                  dk(  rt        | |j                        }|j	                  |j                   d|j
                  t        j                  u r|j                         n|        v t        |       j                   ddj                  |       dS )zF
        Emits a string representation that fits on one line.
        r~   =(, r   )dataclassesfieldsr   getattrrg   r}   r!   r`   rJ   r   r   )r   rl  fieldvals       rS   r   zMemoryPlanningLine.__str__  s      ''- 	EzzY&$

+CKK::,a%**		2IsST		 t*%%&a		$'8::rU   Nr  r   r   r  r.  r   r   )r   r   r   r   r  r*  r   rv   rU   rS   r  r    s    !!);rU   r  c                  *    e Zd Zd ZddZddZddZy)EfficientPeakEstimatec                   ddl m}m} t        j                  j
                  j                  }t        t        j                  j                  j                               }t        t        j                  j                               } |||      } ||||      \  | _        }ddlm}  ||t        j                  t         d      | _        y )Nr   )estimate_peak_memoryget_freeable_input_bufr4   )SegmentedTreer   )memoryr  r  r3   rH   	schedulernodesr   graph_inputskeysget_output_namesoverall_peak_memorysegmented_treer  operatorr   max)	r   r  r  scheduler_nodesr  graph_outputsnames_to_freeable_bufspeak_by_scheduler_noder  s	            rS   r  zEfficientPeakEstimate.__init__  s    I''++11!!''"6"6";";"=>"177#;#;#=>!7!V;O"<
8 "8 	2+"HLL#q
rU   c                    t         j                  j                  j                  t         j                  j	                  |      d      t        |j                               z  S )Nr   fallback)r3   rH   rN   	size_hintrI   r*   rM   r   rP   s     rS   	_get_sizezEfficientPeakEstimate._get_size  sL    ww))GG//5 * 
4>>+,- 	-rU   c                n    | j                   j                  |j                  dz   |j                  dz
        S r   )r  summarize_rangescheduler_node_indexr   line_aline_bs      rS   peak_betweenz"EfficientPeakEstimate.peak_between  s6    ""22''!+V-H-H1-L
 	
rU   c                    |j                   dz   |j                   k(  ry | j                  j                  |j                   dz   |j                   dz
  | j                  |j                               y r   )r  r  update_ranger  rP   r  s      rS   update_peak_betweenz)EfficientPeakEstimate.update_peak_between  s^    &&*f.I.II((''!+''!+NN6;;'	
rU   N)rP   
BufferLiker   r   )r  r  r  AllocateLine)r   r   r   r  r  r  r  rv   rU   rS   r  r    s    
&-



rU   r  c                  >    e Zd ZU ded<   d Zd	dZd
dZddZddZy)r  r  rP   c                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rG   r3   rH   r  current_noder  indexr  r   s    rS   r$  zAllocateLine.__post_init__  T    ww  --999$%GG$5$5$;$;$A$AGG**%
!rU   c                    |j                   dz   | j                   k(  ry| j                  j                  j                  }| j                  j                  j	                  ||       }||z   }||k  S )Nr4   T)r  r~   estimate_peakr  r  )r   	free_linerb   r  peak_memory_in_rangenew_peak_memorys         rS   should_reuse_bufferz AllocateLine.should_reuse_buffer  si    ))A-1J1JJ"ll88LL#||99FFyRVW!55"555rU   c           	        | j                   j                         t        j                  j                  v rt        | j                        S t        | j                         }t        j                  r
||v r|j                  |      }t        j                  j                  j                  t        j                  j                  | j                         d      t        | j                   j                               z  }| j!                  ||      rXd|_        | j                  j$                  j'                  ||        t)        | j                  |j                   | j                         S |j+                  ||       | S | j                   j-                         j.                  dk(  rh| j                  j1                  | j                         }|A|xj2                  t5        t7        j8                  t:        j<                  |d            z  c_        | S )Nr   r  Tcpur4   )rP   rJ   r3   rH   r  NullLiner~   rT   r    allow_buffer_reuser  rN   r  rI   r*   rM   r  r  r  r  	ReuseLiner  rL   r}   static_shape_for_buffer_or_noner  r   	functoolsreducer  mul)r   r  r   r  rb   static_shapes         rS   r  zAllocateLine.plan  s}   99177#:#::DLL)) tyy)$$		#I77##--33DII> . tyy22456D ''	48&*	#**>>y$O y~~tyyII

3	*99((*//58<<GG		RL'11S$$X\\<C6 1 rU   c                    | j                   j                         t        j                  j                  vsJ | j
                  j                  | j                         }|j                  |       y rG   )rP   rJ   r3   rH   r  r~   make_buffer_allocationr   r   r)  r   s      rS   r*  zAllocateLine.codegen  sK    yy!!#177+B+BBBB||22499=trU   c                    |j                   S rG   )_generate_allocater  s     rS   r  zAllocateLine.codegen_fx  s    +++rU   N)r  r  rb   r   r   r   r  r.  r  )	r   r   r   r   r$  r  r  r*  r  rv   rU   rS   r  r    s!    

68
,rU   r  c                  D    e Zd ZU ded<   dZded<   d ZddZddZdd	Zy
)r  r  rP   Fr   r  c                   t         j                  j                  j                  J t         j                  j                  j                  j                  t         j                  j                  j                        | _        y rG   r  r   s    rS   r$  z!FreeIfNotReusedLine.__post_init__  r  rU   c                   t        | j                  j                               dkD  r| S t        | j                  j                  t
        j                        r| S | j                  rJ | j                  j                         t        j                  j                  v rt        | j                        S t        j                  r%|j!                  t#        | j                        |        | S r   )r   rP   get_inputs_that_alias_outputr_   layoutr!   MultiOutputLayoutr  rJ   r3   rH   r  r  r~   r    r  r  rT   r  s     rS   r  zFreeIfNotReusedLine.plan  s    tyy55781<Kdii&&(<(<=K>>!!99177#:#::DLL))$$JJ'		2D9rU   c                    | j                   j                         t        j                  j                  vsJ | j
                  s5|j                  | j                  j                  | j                                y y rG   )	rP   rJ   r3   rH   r  r  r   r~   r  r(  s     rS   r*  zFreeIfNotReusedLine.codegen  sR    yy!!#177+B+BBBB~~NN4<<88CD rU   c                    |j                   S rG   )_generate_free_if_not_reusedr  s     rS   r  zFreeIfNotReusedLine.codegen_fx      555rU   Nr  r.  r  )	r   r   r   r   r  r$  r  r*  r  rv   rU   rS   r  r     s'    
It

E
6rU   r  c                  D    e Zd ZU ded<   ded<   ded<   d
dZddZddZy	)ReinterpretLiner  rP   	reused_asz	ir.Layoutr  c                    | S rG   rv   r  s     rS   r  zReinterpretLine.plan&  s    rU   c                @   t        | j                  t        j                        sJ t        | j                  j                  t        j
                        sJ | j                  j                  | j                  j                         | j                  j                         y rG   )
r_   r  r!   NonOwningLayoutviewr$   r~   codegen_deferred_allocationr  rJ   r(  s     rS   r*  zReinterpretLine.codegen)  sj    $++r'9'9:::$++**B,>,>???00NN##%t{{'7'7	
rU   c                    |j                   S rG   )_generate_reinterpretr  s     rS   r  zReinterpretLine.codegen_fx0  r  rU   Nr  r.  r  )r   r   r   r   r  r*  r  rv   rU   rS   r  r     s#    

/rU   r  c                  H    e Zd ZU ded<   ded<   dZded<   ddZddZdd	Zy
)r  r  rP   r  Tr   
delete_oldc                p   | j                   j                         t        j                  j                  v rK| j
                  j                         t        j                  j                  v sJ t        | j                        S | j
                  j                         t        j                  j                  vsJ | S rG   )rP   rJ   r3   rH   r  r  r  r~   r  s     rS   r  zReuseLine.plan:  s    99177#:#::>>**,0G0GGGGDLL))~~&&(0G0GGGGrU   c                p   | j                   j                         t        j                  j                  vsJ | j
                  j                         t        j                  j                  vsJ |j                  | j                  j                  | j                   | j
                  | j                               y rG   )
rP   rJ   r3   rH   r  r  r   r~   make_buffer_reuser  r(  s     rS   r*  zReuseLine.codegenA  sz    yy!!#177+B+BBBB~~&&(0G0GGGGLL**499dnndooV	
rU   c                    |j                   S rG   )_generate_reuser  s     rS   r  zReuseLine.codegen_fxH  s    (((rU   Nr  r.  r  )r   r   r   r   r  r  r*  r  rv   rU   rS   r  r  4  s'    
J
)rU   r  c                      e Zd ZddZy)r  c                    |j                   S rG   )_generate_nullr  s     rS   r  zNullLine.codegen_fxM  r  rU   Nr  r  rv   rU   rS   r  r  L  s    (rU   r  c                  X    e Zd ZU ded<   ded<   ed	d       Zed
d       Zedd       Zy)CommBufferLiner   r~   	ir.BufferrP   c                    ddl m} | j                  j                         }| j                  j	                         } ||      rt        d| j                         t        |      |j                  z  S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )torch._inductor.utilsr  rP   	get_numelrM   AssertionErrorr   itemsize)r   r  numelr   s       rS   rb   zCommBufferLine.sizeV  sa    5		##%		##%u ?		{K  5zENN**rU   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rG   )rP   get_output_specr_   r!   CommBufferLayoutcomm_buffer_typer   r  s     rS   r#  zCommBufferLine.comm_buffer_typeb  s6    **,&""5"5666&&&rU   c                    | j                   j                         }t        |t        j                        sJ |j
                  S rG   )rP   r!  r_   r!   r"  
group_namer$  s     rS   r&  zCommBufferLine.group_nameh  s6    **,&""5"5666   rU   Nr   r   )r   zir.CommBufferTyper  )r   r   r   r   propertyrb   r#  r&  rv   rU   rS   r  r  Q  sG    !!
O	+ 	+ ' '
 ! !rU   r  c                  ,    e Zd ZddZed        ZddZy)CommBufferAllocateLinec                "   | j                   j                         t        j                  j                  vsJ | j                   j                         }| j                   j                         }| j                   j                         }t        | j                   j                               }t        | j                   j                               }|j                  | j                  | j                  | j                  | j                  |||||             y rG   )rP   rJ   r3   rH   r  rw  rM   r   get_size
get_strider   make_allocation_liner#  r&  r~   )r   r)  r   r  r   shaperc   s          rS   r*  zCommBufferAllocateLine.codegenq  s    yy!!#177+B+BBBByy!!#%%'		##%dii((*+tyy++-.%%%%		
rU   c                    | t         j                  j                  k(  rS| d|j                  |       d|j                  |       d| d|j                   d| dt        j                  dd       dS t        d	|        )
Nz = empty_strided_p2p(r  z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    r   zUnsupported comm buffer type: )r!   CommBufferTypeSYMM_MEMcodegen_shape_tupler  randomrandintr  )r#  r&  r~   r   r  r   r/  rc   s           rS   r.  z+CommBufferAllocateLine.make_allocation_line  s     r00999&-..u56b..v67r' &&,ll^ 4)l +"NN1i89< &01A0BC rU   c                    |j                   S rG   )_generate_comm_buffer_allocater  s     rS   r  z!CommBufferAllocateLine.codegen_fx      777rU   Nr.  r  )r   r   r   r*  r7  r.  r  rv   rU   rS   r*  r*  o  s     
(  $8rU   r*  c                      e Zd ZddZddZy)CommBufferFreeLinec                    | j                   j                  | j                        }|j                  | d| j                  j
                   d       y )Nz # z buffer free)r~   r  rP   r   r#  r   r  s      rS   r*  zCommBufferFreeLine.codegen  s@    ||,,TYY7$s4#8#8#>#>"?|LMrU   c                    |j                   S rG   )_generate_comm_buffer_freer  s     rS   r  zCommBufferFreeLine.codegen_fx      333rU   Nr.  r  rd  rv   rU   rS   r:  r:    s    N4rU   r:  c                  J    e Zd ZU dZded<   ded<   ded<   ded<   dd	Zdd
Zy)MultiOutputLinezU
    Given a MultiOutputLayout buffer, indexes actual buffer(s) from the result.
    r   r~   r   result_namearg_nameSequence[Any]indicesc                      fd  j                    j                        }|j                   j                  j                    j
                   d|  j                  j                          y )Nc                l   t        |      dkD  r|d   \  }}t        |t              r |  d| d|dd        S t        |t              r<j                  j                  | j                  t        |            } ||dd        S t        |t              r |  d| d|dd        S t        d|      | S )Nr   []r4   z['z']znon supported index type: )
r   
issubclassr  r   r~   codegen_tuple_accessrA  r   dictr  )basenamerD  itypeituple_accesscodegen_list_tuple_accessr   s        rS   rP  z:MultiOutputLine.codegen.<locals>.codegen_list_tuple_access  s    7|a"1:qeT*4z1#Q5GQRQSUUu-#'<<#D#D $"2"2CF$L 5\712;OOt,4zA3b5I7STSU;WW()EuMMrU   r   )rB  rD  r   r~   declarerA  ending)r   r)  r   rP  s   `  @rS   r*  zMultiOutputLine.codegen  s]    	 $ *$--F||##$T%5%5$6c%ATAT@UV	
rU   c                    |j                   S rG   )_generate_multi_outputr  s     rS   r  zMultiOutputLine.codegen_fx  s    ///rU   Nr.  r  )r   r   r   __doc__r   r*  r  rv   rU   rS   r@  r@    s*     "!M
00rU   r@  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)IndexPutFallbackLiner   r~   ir.IndexPutFallbackrP   zlist[Optional[ir.IRNode]]rD  c                   | j                   }t        j                  |j                        sJ d |j                  d d D        \  }}| j                  D cg c]*  }|r|j                         n| j                  j                  , }} | j                  j                  |j                         |||g|j                           y c c}w )Nc              3  <   K   | ]  }|j                           y wrG   rz  rw   ts     rS   ry   z/IndexPutFallbackLine.codegen.<locals>.<genexpr>  s     Fq**,F   r   )rP   r!   is_node_sequenceinputsrD  rz  r~   none_str_generate_index_put_fallbackru  codegen_const_args)r   r)  rP   r   valuesidxrD  s          rS   r*  zIndexPutFallbackLine.codegen  s    yy""4;;///Fdkk"1oFF ||
 (+C!!#0E0EE
 

 	211  "Aw	
9=9P9P9R	

s   /Cc                    |j                   S rG   )rb  r  s     rS   r  zIndexPutFallbackLine.codegen_fx  r  rU   Nr.  r  r^  rv   rU   rS   rW  rW    s    !!
&&
6rU   rW  c                  2    e Zd ZU ded<   ded<   ddZd	dZy)
ScatterFallbackLiner   r~   ir.ScatterFallbackrP   c           
     8   | j                   }t        j                  |j                        sJ |j                  rd |j                  D        \  }}}n%d |j                  D        \  }}|j
                  d   }|j                         x}r|j                  nt        j                  j                  }| j                  j                  |||j
                  d   ||g|j                  |j                  |j                  |j                  d   |j!                         |       y )Nc              3  <   K   | ]  }|j                           y wrG   r[  r\  s     rS   ry   z.ScatterFallbackLine.codegen.<locals>.<genexpr>  s     Jq224Jr^  c              3  <   K   | ]  }|j                           y wrG   r[  r\  s     rS   ry   z.ScatterFallbackLine.codegen.<locals>.<genexpr>  s     EA!--/Er^  r4   r   r  )rP   r!   r_  r`  src_is_tensorconstant_argsrw  r}   r3   rH   rx  r~   _generate_scatter_fallbackrv  python_kernel_namer   rj  )r   r)  rP   r   r  r   r~  r  s           rS   r*  zScatterFallbackLine.codegen  s    yy""4;;///JdkkJOQsEEJQ$$Q'C!%!22A29L9L//""1%uc2  ##KK!!		
rU   c                    |j                   S rG   )ro  r  s     rS   r  zScatterFallbackLine.codegen_fx  r>  rU   Nr.  r  r^  rv   rU   rS   rh  rh    s    !!

(4rU   rh  c                  <    e Zd ZU ded<   ded<   ded<   d
dZddZy	)SymbolicCallArgLiner   r~   r   argrA   rH   c                d    | j                   j                  | j                  | j                         y rG   )r~   "_generate_symbolic_call_arg_helperrt  rH   r(  s     rS   r*  zSymbolicCallArgLine.codegen  s    77$**MrU   c                    |j                   S rG   )_generate_symbolic_call_argr  s     rS   r  zSymbolicCallArgLine.codegen_fx
  r  rU   Nr.  r  r^  rv   rU   rS   rs  rs    s    !!	N5rU   rs  c                  F    e Zd ZU ded<   ded<   ded<   ded<   dd	Zdd
Zy)UnbackedSymbolDefsLiner   r~   r   output_namer   outputs,Optional[dict[sympy.Symbol, pytree.KeyPath]]unbacked_bindingsc                z    | j                   j                  | j                  | j                  | j                         y rG   )r~   )_codegen_unbacked_symbol_defs_for_outputsr{  r|  r~  r(  s     rS   r*  zUnbackedSymbolDefsLine.codegen  s+    >>dllD,B,B	
rU   c                    |j                   S rG   )_generate_unbacked_symbol_defsr  s     rS   r  z!UnbackedSymbolDefsLine.codegen_fx  r8  rU   Nr.  r  r^  rv   rU   rS   rz  rz    s#    !!LCC

8rU   rz  c            	          e Zd ZdZdZ fdZe	 d	 	 	 	 	 	 	 dd       ZddZddZ	ddZ
dd	Zdd
Zedd       ZddZedd       ZddZedd       ZddZ	 	 ddZddZddZddZddZddZddZddZddZddZddZd Zd Z d Z!d Z"d  Z#dd!Z$dd"Z%dd#Z&dd$Z'dd%Z(dd&Z)dd'Z*dd(Z+dd)Z,d* Z-	 	 	 	 dd+Z.	 d	 	 	 	 	 	 	 	 	 	 	 	 	 dd,Z/dd-Z0dd.Z1dd/Z2d0 Z3dd1Z4d2 Z5dd3Z6d4 Z7	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd5Z8d6 Z9dd7Z:e;jx                  dd8       Z=dd9Z>d: Z?d; Z@d< ZAd= ZBdd>ZC	 	 	 	 	 	 dd?ZDd@ ZEddAZFdB ZGddCddDZHddCddEZIddFZJddGZKddHZLddIZM	 	 ddJZN	 d	 	 	 ddKZOddLZPddMZQdN ZRdO ZSdP ZTdQ ZUdR ZVdS ZW	 	 	 d	 	 	 	 	 	 	 	 	 ddTZXe	 d	 	 	 	 	 ddU       ZY	 	 	 d	 	 	 	 	 	 	 	 	 ddVZZddWZ[	 	 ddXZ\dddYZ]	 	 	 	 	 	 ddZZ^dd[Z_dd\Z`d] Zad^ Zbd_ Zcd` Zdda Zedb Zfdc Zgdd ZhddeZidf Zjddddddddg	 ddhZkdddddddiddj	 ddkZldl Zmdm Zndn ZoddoZpddpZqedq        Zr	 ddrZsds ZtddtZudduZvddvZwddwZx	 d	 ddxZyddyZzddzZ{dd{Z|d| Z}dd}Z~d~ ZddZd Z	 	 	 	 	 	 	 	 ddZ	 	 	 	 	 	 	 	 ddZd Z	 	 	 	 ddZddZd Zd Zd Zd Zd Zd ZddZd Zed        Zed        Zed        Zed        Zed        Z	 	 	 	 ddZd Zd Z xZS )r   zB
    Generate outer wrapper in Python that calls the kernels.
    Tc                    t                    t                _        i  _        t                _        t                _        t                _        t                _	        t                _
        t                _        t                _        t                _        t                _        t                _        i  _        d _        i  _        t                _        g  _        d _        d _        d _        d _        d _        t6        j8                  j:                  rdnd _        t6        j8                  j:                  rdnd _        d  _         d _!        i  _"        t                _#        t                _$        d  _%         jM                          g  _'        g  _(         jS                          tU               s jW                           jY                          t6        j8                  jZ                  sBt6        j8                  j\                  j_                         D ]  \  }} ja                  ||        t        tb                   _2        t        tb                   _3        i  _4         tk        jl                  d        jn                         _7        tj        jp                  d
 fd       }| _9        i  _:        t                _;        ty                _=        t                _>        i  _?        t        t        j                  j                  t        j                  j                  	       _E        g  _F        y )Nr    #r  z
std::move(r   Tc                    j                   j                  |        t        j                  j                  rj
                  j                  |        y y rG   )importsr   r    r   r   r   )r   r   s    rS   add_import_oncez6PythonWrapperCodegen.__init__.<locals>.add_import_oncer  s;    LL""4(}}55**44T: 6rU   )debug_printer_leveluse_array_ref)r   r   r   r  )Gr   r  r   _names_iterargs_to_buffersr+   r  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   r   kernel_autotune_example_argskernel_autotune_tmp_arg_idxsrc_to_kernelkernel_numel_exprlinesrQ  declare_maybe_referencerR  commentra  r3   rH   rU  
move_beginmove_endrP  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr#  launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerr,   write_prefix!write_kernel_autotune_defs_headerrV  constant_reprsitemswrite_constant
BufferName	allocatedfreedreusesr  	lru_cachewrite_get_raw_streamcacher  _metas
_meta_varsr   multi_kernel_statealready_codegened_subgraphsallocated_workspacesr   r    aot_inductor debug_intermediate_value_printerallow_stack_allocationdebug_printeradditional_files)r   r   hashedr  r  s   `   rS   r  zPythonWrapperCodegen.__init__)  s   */'  	 &'$&$&$&#1#3 *,$2$4!%3%5"$2$4!6@l" IK)01( .0HR!#
')$*+''*=*=,2 ww22;?)+/(QS&L 	" 9C $!!# &("$&!248..0ww ! 6 6 < < > 2f##D&12 $J/1
+-
 57$=I$7$7$=%%%
! 
	; 
	;
  /&(+5<"2"4<FL(46! 1 & 3 3 T T --DD
 !#rU   Nc                D    | r|J |J t        |||      S t               S rG   )SubgraphPythonWrapperCodegenr   )is_subgraphsubgraph_nameparent_wrapperpartition_signaturess       rS   createzPythonWrapperCodegen.create  s?      ,,,!---/~/C  $%%rU   c                    d| _         y )Ncall)r  r   s    rS   r  z)PythonWrapperCodegen.set_launcher_fn_name  s
     &rU   c                D    | j                   j                  | d|        y )Nz = None  # )r  r   )r   r   r  s      rS   r  z#PythonWrapperCodegen.write_constant  s    k&:;rU   c           	        t         j                  j                  j                         }d}||j                  d|j                   }d}t        t        j                  j                        dkD  rd}n0t         j                  j                  j                  j                  rd}| j                  j                  d| dt        j                   d| d	d
       | j                   j                  dd
       	 ddlm} | j                   j                  dd
       t        j*                  r| j                   j-                  d       y y # t&        t(        f$ r Y >w xY w)Nr  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infozFfrom torch._inductor.runtime.debug_utils import tracked_empty_strided
z
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from zq import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                assert_alignment = torch._C._dynamo.guards.assert_alignment
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cpu_pinned = torch._C._dynamo.guards._empty_strided_cpu_pinned
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                empty_strided_mtia = torch._C._dynamo.guards._empty_strided_mtia
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torch_guardsTracingContexttry_getaot_graph_namer   r    r  r  	_inductortest_configstrack_memory_lifecycler  r   r   r   r  torch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r   contextaot_config_commentinductor_debug_utilsr  s        rS   r  z!PythonWrapperCodegen.write_header  sc   --..6687#9#9#E#-g.D.D-E!F!v""CCDqH#w __##00GG#l #$ % $,,- .%& '!$ ' 	 	
* 	 ! 	 	
$	 DKK 	   ##KK!!"?@ $ , 		s   6#E EEc                     y rG   rv   )r   r  s     rS   include_extra_headerz)PythonWrapperCodegen.include_extra_header      rU   c                    | j                   j                  dt        j                   d       	 ddlm} | j                   j                  dd       y # t        t        f$ r Y y w xY w)Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            r   )_cuda_getCurrentRawStreamzU
                get_raw_stream = torch._C._cuda_getCurrentRawStream
                Tr   )r  r   r   r   torch._Cr  r  r  )r   r  s     rS   r  z6PythonWrapperCodegen.write_kernel_autotune_defs_header  sw    !!((
 $,,- .	

	:%%,, 	 -  ^, 		s   #A A%$A%c                   dt         j                   d}t        j                  j                  r]| j
                  j                  |       | j
                  j                  t        j                  j                  j                  d             t        j                  j                  s`| j                  j                  |d       | j                  j                  t        j                  j                  j                  d             y y )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r%   r   r    r   r   r   r   r   r3   rH   rW  import_get_raw_stream_asrU  r  r   
import_strs     rS   write_triton_header_oncez-PythonWrapperCodegen.write_triton_header_once  s     $,,- .

 ==11&&--j9&&00"";;<LM ww""LL
$7LL"""";;<LM #rU   c                   t         j                  j                  j                  d      }t        j
                  j                  r6| j                  j                  |      s| j                  j                  |       t         j                  j                  s8| j                  j                  |      s| j                  j                  |       y y y )Nr  )r3   rH   rW  r  r    r   r   r   containsr   rU  r  )r   import_get_raw_stream_strs     rS   write_get_raw_stream_headerz0PythonWrapperCodegen.write_get_raw_stream_header  s    $%GG$6$6$O$O%
! ==11--667PQ**445NOww""<<(()BC&&'@A D #rU   c                $    | j                          y rG   )r  r   s    rS    write_get_raw_stream_header_oncez5PythonWrapperCodegen.write_get_raw_stream_header_once  s    ((*rU   c                   t        |      }|| j                  vrdt        | j                         }|| j                  |<   | j                  j	                  | d|        t
        j                  j                  r;| j                  j	                  | d|        | j                  j                  |       | j                  |   S )Nmetar   )reprr  r   r  r   r    r   r   r   r  r   )r   r  vars      rS   add_meta_oncez"PythonWrapperCodegen.add_meta_once   s    Dzt{{"T[[)*+C #DKKKK!!SETF"34}}55**44uCv5FG##C({{4  rU   c                z    | j                         D cg c]  }|j                  | j                         c}S c c}w rG   )get_graph_outputsrz  r  r   r   s     rS   get_output_refsz$PythonWrapperCodegen.get_output_refs.  s<     =A<R<R<T
78A 1 12
 	
 
s   "8c                     y rG   rv   r   s    rS   mark_output_typez%PythonWrapperCodegen.mark_output_type4      rU   c                6    t         j                  j                  S rG   )r3   rH   r  r   s    rS   get_graph_inputsz%PythonWrapperCodegen.get_graph_inputs7  s     ww###rU   c                6    t         j                  j                  S rG   )r3   rH   r  r   s    rS   r  z&PythonWrapperCodegen.get_graph_outputs<  s    ww$$$rU   c           
        | j                         j                         D ]  \  }}t        |t        j                  t
        j                  f      r1|t        j                  j                  vst        |t
        j                        rht        |j                               dk(  r| j                  |j                               }| j                  |j                               }| j                  j!                  d| d| d| d        y )Nr   zassert_size_stride(r  r   )r  r  r_   rp   r   r!   TorchBindObjectr3   rH   graph_input_namesGeneratorStater/   r,  r   r-  r  r   )r   r   bufrb   rc   s        rS   codegen_input_size_assertsz/PythonWrapperCodegen.codegen_input_size_asserts?  s    ..0668 	SID##

B,>,>?@ 177444
R&&9  S\\^,1223<<>BD44S^^5EFFKK!!$7vRvRxq"QR	SrU   c                `   | j                   j                  d       | j                         j                         D ]r  \  }}t	        |t
        j                  t        j                  f      r1d| d}| j                   j                  |       d| d}| j                   j                  |       t y )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r  r   r  r  r_   rp   r   r!   r  )r   r   r  r   s       rS   codegen_input_nan_assertsz.PythonWrapperCodegen.codegen_input_nan_assertsQ  s    HI..0668 	(ID##

B,>,>?@ &;<DKK!!$' &;<DKK!!$'	(rU   c                :    | j                   j                  d       y )NzV

            async_compile.wait(globals())
            del async_compile
            )r  r   r   s    rS   write_async_compile_waitz-PythonWrapperCodegen.write_async_compile_wait\  s    	
rU   c                    dj                  |      }t        |      dk(  r|dz  }| j                  j                  | d       | j                  j                  d       y )Nr  r4   ,z = argszargs.clear())r   r   r  r   )r   input_nameslhss      rS   
write_argszPythonWrapperCodegen.write_argse  sP    ii${q 3JCWo.n-rU   c                    t         j                  r| j                  j                  d       d}|S | j                  j                  d| j                   d       d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r4   )r    graph_partitionr  r   r  r   prefix_indents     rS   !write_launcher_fn_call_get_indentz6PythonWrapperCodegen.write_launcher_fn_call_get_indentl  sm    !!KK M  KK**+ ,
 MrU   c                6    t         j                  j                  S rG   )r3   rH   r  r   s    rS   get_graph_input_namesz*PythonWrapperCodegen.get_graph_input_names  s    ww(((rU   c                   | j                   J | j                          | j                         }| j                  j	                  |      5  t
        j                  j                  rA| j                  j                  t        j                  j                  j                                t        j                  j                         }t
        j                  r| j                  j                  d| d       | j                         x}r| j!                  |       | j#                          t%               rt'        |       r| j)                          d d d        y # 1 sw Y   y xY w)Nz0training_annotation = nvtx._device_range_start(''))r  r  r  r  r   r    r   debug_sync_graphr   r3   rH   rW  synchronizeget_training_phaser  r  r  codegen_inputsr-   r,   "codegen_input_size_and_nan_asserts)r   r  phaser  s       rS   r  z!PythonWrapperCodegen.write_prefix  s   $$000%%'>>@[[. 	:}}--%%agg&8&8&D&D&FGGG..0E''%%FugRP %)$>$>$@@ @ 12!
 -.<TB779)	: 	: 	:s   
DEEc                    t         j                  r| j                          t         j                  r| j	                          y y rG   )r    size_assertsr  nan_assertsr   r   s    rS   r  z7PythonWrapperCodegen.codegen_input_size_and_nan_asserts  s1    ++-**, rU   c                   | j                          d| }t        j                  j                  r=| j                  j                  | d| d       t        j                  j                  r|S | j                  | d| d       |S )Nstream = get_raw_stream(r   )	r  r    r   r   r   r   r3   rH   rU  )r   rN  r  r   s       rS   r  z)PythonWrapperCodegen.write_get_raw_stream  s    ((*
|$==11&&00&*:,a8 ww""$1*Q?@rU   c                     | j                   d   S )N)r  r   s    rS   get_codegened_graphz(PythonWrapperCodegen.get_codegened_graph  s    ))"--rU   c                :    | j                   j                  |       y rG   )r  rg   )r   rH   s     rS   r&  z)PythonWrapperCodegen.push_codegened_graph  s    ""))%0rU   c                6    | j                   j                         S rG   )r  r  r   s    rS   rH  z(PythonWrapperCodegen.pop_codegened_graph  s    ))--//rU   c                P    ddl m} | j                  j                   ||            S )Nr   )deepcopy)copyr#  r  rg   )r   r#  r#  s      rS   r"  z(PythonWrapperCodegen.push_computed_sizes  s!    !((//0HIIrU   c                6    | j                   j                         S rG   )r  r  r   s    rS   rF  z'PythonWrapperCodegen.pop_computed_sizes  s    ((,,..rU   c                .    t        | j                         S rG   )nextr  r   s    rS   next_kernel_suffixz'PythonWrapperCodegen.next_kernel_suffix  s    t''()*rU   c                   | j                  t        || j                               t        j                  j
                  r| j                          | j                  j                  dt        j                  j                  j                  |       d       | j                  j                          t        |       r| j                          | j                  j                  d| d| d       || _        y )NrS  rT  r  r  r   )r   rM  rP  r    r   r   r  r   r3   rH   rW  rZ  r'  r,   r  )r   rN  s     rS   codegen_device_guard_enterz/PythonWrapperCodegen.codegen_device_guard_enter  s    )*d6W6WX	
 ==11))+&&00**77
CDAF &&002248002&&00$6zl!D -7)rU   c                    | j                  t                      t        j                  j                  r| j
                  j                          y y rG   )r   r`  r    r   r   r   rI  r   s    rS   codegen_device_guard_exitz.PythonWrapperCodegen.codegen_device_guard_exit  s6    356==11&&224 2rU   c                   |r,t         j                  r| j                  j                  ddj	                  |      z   dz          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                          | j                  j                  d       | j                  j                  d       | j                  j                  d       | j                  j                  d	dj	                  |      z   dz          y | j                  j                  d
       y )Nzreturn_vars = (r  , )zfor var in return_vars:z!if isinstance(var, torch.Tensor):z#assert not var.isnan().any().item()z#assert not var.isinf().any().item()r   zreturn (z	return ())r    r  r  r   r   r'  rI  )r   output_refss     rS   generate_returnz$PythonWrapperCodegen.generate_return  s   !!!!++%		+(>>F !!++,EF!!++-!!++,OP!!++-!!++,QR!!++,QR!!--a0''
TYY{5K(Ke(ST''4rU   c                     y rG   rv   r   results     rS   generate_before_suffixz+PythonWrapperCodegen.generate_before_suffix  r  rU   c                    t         j                  rNdj                  | j                        t	        | j                        dk(  rdndz   }|j                  d| d       y y )Nr  r4   r  r  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r    r	  r   all_partition_namesr   r   )r   r3  all_partition_name_lists      rS   generate_after_suffixz*PythonWrapperCodegen.generate_after_suffix  se    !!&*ii0H0H&I43349r'# MM--D,E F "rU   c                     y rG   rv   r2  s     rS   generate_endz!PythonWrapperCodegen.generate_end  r  rU   c                :    | j                  t        | |             y rG   )r   rf  r  s     rS   generate_fallback_kernelz-PythonWrapperCodegen.generate_fallback_kernel  s    ,T489rU   c                    |j                  |        | j                  t        | |             t        |j                  t
        j                        r|j                  |        y y rG   )codegen_commentr   rf  r_   r  r!   Layoutcodegen_size_assertsr  s     rS   generate_extern_kernel_allocz1PythonWrapperCodegen.generate_extern_kernel_alloc  sI    T",T489dkk299-%%d+ .rU   c           
        t        |j                  t        j                        }|j	                         }|j                         }|j                         }| j                  }t        j                  r	d|v rd| }|r5| j                  | j                   | ddj                  |       d|        y | j                  | j                   | d| ddj                  |       d|        | j                  rKt        j                  r:|7t        d   dxx   d	z  cc<   | j                  d
|j                   d| d       y y y y )Nview_as_complex.clone()r  r  r   r   inductorintermediate_hooksr4   zrun_intermediate_hooks()r_   r  r!   
NoneLayoutrJ   get_origin_noderu  rR  r    memory_planningr   rQ  r   r  generate_intermediate_hooksr   r   )r   extern_kernelrl  	no_returnr{  origin_noder}  rR  s           rS   rk  z9PythonWrapperCodegen._generate_extern_kernel_alloc_helper  s;    }33R]]C	#,,.#335#335!!&7;&F  x(FNNdll^K=$))D/9J!F8TUNN<<.SQtyy>OqQWPXY 0066+$%9:a?:-k.>.>-AK=PQR , 7 1rU   c                \    |j                  |        | j                  t        | |             y rG   )r>  r   rp  r  s     rS   generate_extern_kernel_outz/PythonWrapperCodegen.generate_extern_kernel_out4  s&     	T"*467rU   c                    t         j                  j                  j                  }|j	                  ||d d d       |j                  d|r|n|        |5  | j                  | ddj                  |       d       d d d        y # 1 sw Y   y xY w)Nexternzout=r  r  r   )r3   rH   wrapper_coder  set_printer_argsrg   r   r   )r   r   outout_viewrl  r  stack_tracesdebug_printer_managers           rS   ry  z7PythonWrapperCodegen._generate_extern_kernel_out_helper;  s     !" 4 4 B B..tVT4Rdx8S9:;" 	;NNfXQtyy&7q9:	; 	; 	;s   'BBc                    |j                   }|j                  }|r$t        d |D              }t        d |D              }|j                  j	                          d}dj                   fd|D              }dj                   fd|D              }t        j                   |j                        }d}| d|j                   d	}| d| d| d| }	| d
|	 d}
|
S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrG   r3   rH   rN   atomically_apply_size_hintrw   r~  s     rS   ry   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>O  s%     VA))DDQGV   35c              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrG   rZ  r\  s     rS   ry   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>P  s*      CD  ;;A>r]  z.data_ptr()r  c              3  J   K   | ]  }t         j                  |        y wrG   r   val_to_arg_strrw   dimr   s     rS   ry   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>V  s     XC-<<T3GX    #c              3  J   K   | ]  }t         j                  |        y wrG   r`  rb  s     rS   ry   zRPythonWrapperCodegen._generate_tma_descriptor_call_experimental.<locals>.<genexpr>W  s$      
?B //c:
rd  z$triton.tools.experimental_descriptorz.create_d_tma_descriptorr  r   )
dims
block_dimsr   tensorrz  r   r   ra  element_sizerank)r   descapply_size_hintsrg  rh  ptrrj  r  r   rl  r  s   `          rS   *_generate_tma_descriptor_call_experimentalz?PythonWrapperCodegen._generate_tma_descriptor_call_experimentalK  s    yy__
VQUVVD HR J ..01=yyXSWXXYY 
FP
 

 ,::4ARARS7xx		{*:;bbB|n=QtfArU   c                    |j                   }|rt        d |D              }d}| d}|j                  j                          d| }| d| d}|S )Nc              3  n   K   | ]-  }t         j                  j                  j                  |       / y wrG   rZ  r\  s     rS   ry   zLPythonWrapperCodegen._generate_tma_descriptor_call_stable.<locals>.<genexpr>d  s*       CD  ;;A> r]  z/triton.tools.tensor_descriptor.TensorDescriptorz.from_tensorr  r  r   )block_shaper   ri  rz  )r   rl  rm  rr  r  r   rl  r  s           rS   $_generate_tma_descriptor_call_stablez9PythonWrapperCodegen._generate_tma_descriptor_call_stablea  so    &&  HS  K Cx|$++//12"[MBQtfArU   c                    t        |t        j                        r| j                  ||      S t        |t        j                        sJ | j                  ||      S rG   )r_   r!   TMADescriptorExperimentalro  TMADescriptorStablers  )r   rl  rm  s      rS   _generate_tma_descriptor_callz2PythonWrapperCodegen._generate_tma_descriptor_calln  sW    dB889BB&  dB$:$:;;;<<TCSTTrU   c                    | j                  |      }|j                   d| | j                   }| j                  |       y Nr   )rw  r   rR  r   )r   rl  r  r   s       rS   generate_tma_descriptorz,PythonWrapperCodegen.generate_tma_descriptorw  s:    11$7))Cvdkk]3trU   c                :    | j                  t        | |             y rG   )r   rh  r  s     rS   generate_scatter_fallbackz.PythonWrapperCodegen.generate_scatter_fallback|  s    *467rU   c	                    | ddj                  t        t        |             }	|j                  d      r|	dj                  dg|z         z  }	n|r|	dt	        |       z  }	|	dz  }	| j                  |	       y )Nr  r  zaten.scatter_reducer  r  z	, reduce=r   )r   mapr   r   r  r   )
r   r   r`  rv  rp  rm  r  r   r  r   s
             rS   ro  z/PythonWrapperCodegen._generate_scatter_fallback  s{     %%QsxxC0@'A&BC(()>?DIIrdVm,,D)DL>22trU   c                `   g }|j                   dd  }t        |      }t        |j                        D ]^  \  }}|j                  |   9t	        |      }t        |t        j                        sJ |j                  |       N|j                  d        ` | j                  t        | ||             y )Nr   )r`  iter	enumeraterD  r'  r_   r!   r#   rg   r   rW  )r   rP   rD  valid_indicesiter_valid_indicesrN  _r  s           rS   generate_index_put_fallbackz0PythonWrapperCodegen.generate_index_put_fallback  s    -/AB!-0dll+ 	%DAq||A*/0!%333u%t$	% 	+D$@ArU   c                |    ddj                  |       d}||||g}| j                  | j                  ||             y )NrG  r  rH  )r   r   wrap_kernel_call)r   r   r   rD  rd  
accumulateindices_strrl  s           rS   rb  z1PythonWrapperCodegen._generate_index_put_fallback  sA    $))G,-Q/;
3t,,VT:;rU   c           
     `    | j                  | d| ddj                   |              d       y )Nr   r  r  r   )r   r   )r   buf_namerp  get_argsop_overloadr  r|  s          rS   ,generate_fallback_kernel_with_runtime_lookupzAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookup  s2     	(3'9&:!DIIhj<Q;RRSTUrU   c                f    t        d      5  | j                  |      cd d d        S # 1 sw Y   y xY w)NzPythonWrapperCodegen.generate)r   	_generater   is_inferences     rS   generatezPythonWrapperCodegen.generate  s,    9: 	0>>,/	0 	0 	0s   '0c                &    t         j                  ryy)Nr   r4   )r    r	  r   s    rS   get_wrapper_call_indentz,PythonWrapperCodegen.get_wrapper_call_indent  s    !!rU   c              #  b   K   | j                   }	 || _         | || _         y # || _         w xY wwrG   r   )r   newolds      rS   set_writelinez"PythonWrapperCodegen.set_writeline  s.     nn	! DNI DNSDNs   /# /	,/c                    | j                   j                  }t        j                  j                  r| j
                  j                  |       y | j                  j                  |       y rG   )r  kernel_defsr    r   r   r  r   r  )r   r  s     rS   _write_multi_kernel_defsz-PythonWrapperCodegen._write_multi_kernel_defs  sF    --99==11%%,,[9KK{+rU   c                	   t         j                  r| j                          t        j                         5 }|j                  | j                  j                                t         j                  r| j                  |       t         j                  r| j                          | j                  |       t         j                  j                  r*t         j                  j                  s| j                          | j!                  | j                  j"                        5  | j$                  D ]I  }t'        |t(              r|j+                  | j                         /| j                  j#                  |       K 	 d d d        | j-                          | j/                         }| j1                          t         j                  j2                  rA| j                  j#                  t4        j6                  j8                  j;                                t         j                  r| j=                          t         j                  j                  r*t         j                  j                  s| j?                          t         j                  j                  r| jA                          t         jB                  r+t         jD                  s| j                  j#                  d       | jG                  |       d d d        tI               }|jK                  | jL                         |j#                  d       |jK                  | jN                         t4        j6                  jP                  r>t4        j6                  jD                  r$t4        j6                  jR                  r
tI               }|jK                  | jT                         | jW                          |jK                  | jX                         | j[                         }|j                  |      5  |jK                  | j                         d d d        | j]                  |       |jK                  | j^                         | ja                  |       | jc                  |       | je                  |       |jg                         | jh                  jg                         fS # 1 sw Y   4xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nz+nvtx._device_range_end(training_annotation)r  )5r    profile_bandwidthr  r   	ExitStackenter_contextr  r   profiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphrun_wrapper_ir_passesr   store_cubinr   !generate_reset_kernel_saved_flagsr  r   r  r_   rE   r*  r  r  r  r  r3   rH   rW  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  rU  r0  r+   r   r  r  rV  is_const_graphr  finalize_prefixr  r  r4  r  r8  r:  add_benchmark_harnessgetvaluewithlinemapr  )r   r  stackr   r/  r3  wrapper_call_indents          rS   r  zPythonWrapperCodegen._generate  se   ##))+!!# +	.u 1 1 8 8 :;0088?''))+&&|4}}((1W1W668 ##D$5$5$?$?@ : JJ :D!$4T%6%67))33D9:: ))+..0K!!#}}--!!++AGG,>,>,J,J,LM'''')}}((1W1W557}}55446 ''0B0B!!++A   -W+	.\  !dll#dkk" 77 3 38N8N#%F 	d//0dkk""::<]]./ 	-MM$++,	- 	##F+dkk"""6*&!""6* &&($$88:
 	
w: :+	. +	.|	- 	-s2   C%R3AR&4E-R39S &R0	+R33R= S	c                \   | j                   j                  d       i }t        j                  j                  r_t
        j                  j                  rEt        t
        j                  j                        D ci c]  \  }}| j                  |      | }}}| j                   j                         dz   | j                  j                         z   t        j                  t        j                  k(  rkt!        j"                  t%               dd      5 }|j'                  j)                  d             |j*                  }ddd       t        j,                  d       t/        d	d
 fd       	 t1        |       yc c}}w # 1 sw Y   ExY w# t2        $ r}t5        d|       |d}~ww xY w)z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        rR  z.pyF)dirr  deletezutf-8NzAuto-tuning code written to %sartifactc                     dddS )N&inductor_autotune_at_compile_time_codestring)r   encodingrv   rv   rU   rS   r   zFPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>B  s    @$! rU   c                      S rG   rv   )tuning_codes   rS   r   zFPythonWrapperCodegen.generate_and_run_autotune_block.<locals>.<lambda>F  s    { rU   )metadata_fn
payload_fnz%Failed to run autotuning code block: )r  r   r    r   r   r3   rH   autotuning_inputsr  get_autotuning_input_namer   r   r"   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoder   debugr   exec	ExceptionRuntimeError)r   scopere  vf	file_pather  s          @rS   r  z4PythonWrapperCodegen.generate_and_run_autotune_block  s   
 	!!((	
 ==11agg6O6O ((A(ABC ..s3Q6E 
 %%..0((1134 	
   GMM1 ,,Ke #**734FF		#
 !!0 	 +	
	Se$?# #(  	S!FqcJKQRR	Ss*   9E<-F/F F	F+F&&F+c                \    ddl m}  ||       j                  | j                        | _        y )Nr4   )MemoryPlanner)rI  r  r  r  )r   r  s     rS   memory_planz PythonWrapperCodegen.memory_planN  s     2"4(--djj9
rU   c                   | j                         }t        j                  j                  |      }| j                  rt        | j                  d   t              r| j                  d   j                  j                  |vri| j                  j                          | j                  rCt        | j                  d   t              r&| j                  d   j                  j                  |vrit               g}g }t        t        | j                              D ]  }| j                  |   }t        |t              r"|j                  |d         | j                  |<   Dt        |t              r|j                  t                      nt        |t               s|j                  |j                                 |j                  |j                                t        |      dk(  sJ t#        d |D              }y )Nr  r   c              3  4   K   | ]  }|j                     y wrG   )r  )rw   ss     rS   ry   z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>q  s      +
./A))+
s   )r  r3   rH   _get_output_namesr  r_   r  rP   r   r  r   ranger   r  r  rg   rD  sum)r   r|  	out_namesplanning_statespast_planning_statesrN  r   _total_allocated_buffer_sizes           rS   memory_plan_reusez&PythonWrapperCodegen.memory_plan_reuseS  s   ((*GG--g6	 JJ4::b>+=>

2##((	9 JJNN JJ4::b>+=>

2##((	9 /01!s4::' 	CA::a=D$ 23 $		/"*= >

1D"34&&':'<=D"23$++O,?,?,AB	C 	##O$7$7$9:?#q(((
 (+ +
3G+
 (
$rU   c                    |r!t         j                  r| j                          y t         j                  rt	               | _        | j                          y rG   )r    rI  r  r  r  r  r  r  s     rS   r  z*PythonWrapperCodegen.run_wrapper_ir_passesu  s9    F22((%:%<"""$rU   c           	        	 | j                   	t        j                  	fd       }t        j                  	fd       }t        |t        j
                        rGt        |t        j                        r||v ry 	j                  | d|        |j                  |       y t        |t        j                        rt        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X t        |j                               D ]V  \  }}t        |t        j                        s!||vs&	j                  | d ||       d| d       |j                  |       X y t        |t        j                        ry t        |t        j                        ry t         j"                  j$                  j&                  ry t)        dt+        |             )Nc                <    j                  |  d|  d       |  dS )Nz_size = z.size()_sizer  r   r)  s    rS   sizeofzDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeof  s(    NNdV8D69:V5>!rU   c                <    j                  |  d|  d       |  dS )Nz
_stride = z	.stride()_strider  r  s    rS   strideofzFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideof  s)    NNdV:dV9=>V7##rU   r   rG  rH  zUnknown value type: )r  r  r  r_   rp   r   Symbolr   r   r!   re   r  r,  r-  r  r  r  r  r    r	  r  r}   )
r   r   r   
bound_varsr  r  rc  rb   rc   r)  s
            @rS   codegen_input_symbol_assignmentz4PythonWrapperCodegen.codegen_input_symbol_assignment~  s    {{		" 
	" 
	$ 
	$ eUZZ(eU\\2ez6INNeWCv./NN5!r||,&u~~'78 )	TdELL1d*6LNNdV3vd|nAcU!#DENN4()  ))9)9);< +Vfell3j8PNNfXS$0@#a#HINN6*+ r112r001%%55$';DK=%IJJrU   c           	        t        t        j                            }| j                         }|j	                         D cg c]$  \  }}t        |t        j                        s!||f& c}}|j	                         D cg c]$  \  }}t        |t        j                        r!||f& c}}z   }|D ]  \  }}| j                  |||        	 	 	 	 dd}|D ])  \  }	}t        |t        j                        s! |||       + yc c}}w c c}}w )z$Assign all symbolic shapes to localsc                P   t        j                  | j                         | j                         g      D ]k  }t	        |t
              rt	        |t        j                        r.|j                  D cg c]	  }||vs| }}t        |      dkD  s[t        d| d| d       y c c}w )Nr   zFor z, expected z to have been codegen-ed.)r   from_iterabler,  r-  r_   r   rp   r  free_symbolsr   r  )r   r  exprsymundefined_symbolss        rS   _verify_input_symbol_assignmentzLPythonWrapperCodegen.codegen_inputs.<locals>._verify_input_symbol_assignment  s     ++U^^-=u?O?O?Q,RS 
!$-D%,,1O $(#4#4%:8MC%! % ()A-(tfK0A/BB[\ 
%s   0	B#:B#N)r   ir.TensorBoxr  OrderedSet[sympy.Symbol])	r   rp   r  r  r  r_   r  r!   re   )
r   r  r  kr  r`  r   r   r  r  s
             rS   r  z#PythonWrapperCodegen.codegen_inputs  s    -/
 ,,.+113
q!z!U\\7RQF
 , 2 2 4X1Jq%,,<WaVXY " 	JKD%00ujI	J		0	&  	?HAueR\\2+E:>	?3
Xs   "D"D>"D!Dc                ~   t        |t        j                        rt        |t        j
                        r|| j                  v ry | j                  j                  |       t        j                  j                  j                  |   }t        ||      }| j                  t        | |t        j                               y y y rG   )r_   rp   r  r   r   PRECOMPUTED_SIZEr#  r   r3   rH   rN   inv_precomputed_replacementsr   r   rs  )r   r  r  rt  s       rS   ensure_size_computedz)PythonWrapperCodegen.ensure_size_computed  s    c5<<(^CAVAV-Wd)))##C(77##@@ED!#t,CNN.tS!''BC .X(rU   c                     y rG   rv   r   s    rS   r  z$PythonWrapperCodegen.finalize_prefix  r  rU   rO   c                   t        d      )Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r  r   r   rO   s      rS   codegen_cpp_sizevarz(PythonWrapperCodegen.codegen_cpp_sizevar  s    UVVrU   c                   t        ||      S )Nr  )pexprr  s      rS   codegen_python_sizevarz+PythonWrapperCodegen.codegen_python_sizevar  s    Q**rU   c                $    | j                  |      S rG   )r  r  s     rS   codegen_sizevarz$PythonWrapperCodegen.codegen_sizevar  s    **1--rU   c                    | d| dS )NrG  rH  rv   )r   rL  r   r  s       rS   rJ  z)PythonWrapperCodegen.codegen_tuple_access  s    1UG1%%rU   c                    g t        | j                  |      }t        |      dk(  ryt        |      dk(  r	d|d    dS ddj                  |       dS )Nr   ()r4   r  r.  r  r   )r~  r  r   r   )r   r/  partss      rS   r   z/PythonWrapperCodegen.codegen_python_shape_tuple  s^    :#d1159:u:?u:?uQxj$$499U#$A&&rU   c                $    | j                  |      S rG   )r   )r   r/  s     rS   r3  z(PythonWrapperCodegen.codegen_shape_tuple  s    ..u55rU   c                    dj                  dj                  |t        |      t        |      | j	                  |      | j	                  |      g            g fS )Nzalloc_from_pool({})r  )formatr   r  r   r   )r   r   rd   r   r/  rc   s         rS   codegen_alloc_from_poolz,PythonWrapperCodegen.codegen_alloc_from_pool  s_     %++II&MJ33E:33F;

 
 
	rU   c                ~    t        |      \  }}}	}
} fd}|j                         }|xr ||	k(  }|r||k(  xr ||k(  }|
}n[||j                  j                  k(  xr4 ||j                  j                  k(  xr ||j                  j
                  k(  }|j                  }|r|||k7  r	d| d| dS | S  |||||||      S )Nc           	         
j                  |      }
j                  |      }
j                  |      }d|  d| d| d| d	}	|||k7  r	d|	 d| dS |	S )Nzreinterpret_tensor(r  r   aten.view.dtype()r   r  )r   tgt_size
tgt_stride
tgt_offset
cast_dtype
base_dtyper  stoffr  r   s             rS   apply_reinterpretzHPythonWrapperCodegen.codegen_reinterpret_view.<locals>.apply_reinterpret"  s|     //9A00<B&&z2C(b2bTC5BD%*
*B)$r*Q??KrU   r  r  r   )rm   rJ   r  rb   rc   rd   r   )r   rh   rb   rc   rd   r   r   d_sized_strided_offsetd_dtypecollapsibler  r   	collapsedsame_layoutr  s   `                rS   codegen_reinterpret_viewz-PythonWrapperCodegen.codegen_reinterpret_view  s    8 ,D1 	9(G[		 }}6Fh$6	&.?Vx-?K J ((( 1dkk0001dkk000 
 J Uj%8)$r%::V tVVUJOOrU   c                8    | j                  | d| d| d       y )Nz.copy_(r  r   r  )r   r   dstnon_blockings       rS   codegen_device_copyz(PythonWrapperCodegen.codegen_device_copyA  s!    #gcU"\N!<=rU   c                    |j                         }|j                  d      }| j                  t        | |||j                               y r   )rJ   
input_namer   r@  rD  )r   rP   rA  rB  s       rS   codegen_multi_outputz)PythonWrapperCodegen.codegen_multi_outputD  s6    mmo??1%t[(DLLQRrU   c           
     j   |j                    d|j                   d|j                    d|j                    }|rd|j                   d| d}| j                  |j                   d|j                   d|j
                   d| d	       | j                  j                  t        |j                               y )
N +  if z
 < 0 else zmax(0, min(r  z))r   z * (r   )	r  rb   r   unbacked_offset_symbolbase_offsetbase_dim_strider  r   r   )r   rP   clamp	index_strs       rS   codegen_dynamic_select_indexz1PythonWrapperCodegen.codegen_dynamic_select_indexI  s    zzl#dii[TZZL
4::,W	%dii[9+R@I**+3t/?/?.@DDXDXCYY]^g]hhij	
 	""&&s4+F+F'GHrU   c                     fd} fd}j                   } |j                        } |j                        } j                  | d|         j                  | d|         || d| dj                        } j                  | d| d        j
                  j                  t        j                                y )	Nc           	     `   j                  t        j                  dt        j                  | j                                    }j                  t        j                  dt        j                  | j                  z   j                                    }j                  |       }| d| d| S )Nr   r&  z >= 0 else )r  rp   MaxMinrb   )r   posnegx_condrP   r   s       rS   clamp_indexzDPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.clamp_indexT  s    &&uyyEIIa4K'LMC&&		!UYYq499}dii@AC ))!,FU$vhk#77rU   c                Z    |dk(  r| d|  S j                  |      }d| d|  d| d| S )Nr4   z - r  r%  z	 - 1) // )r  )	start_varend_varstepstep_r   s       rS   codegen_with_stepzJPythonWrapperCodegen.codegen_dynamic_slice_size.<locals>.codegen_with_step\  sJ    qy!#i[11((.Ewis9+SyHHrU   z	_start = z_end = _start_endz
 = max(0, r   )unbacked_size_symbolstartendr   r8  r  r   r   )r   rP   r4  r:  r  r>  r?  	with_steps   ``      rS   codegen_dynamic_slice_sizez/PythonWrapperCodegen.codegen_dynamic_slice_sizeS  s    	8	I ''DJJ'$((##iw/0#gcU+,%VnTlDIIN	#j156""&&s4+D+D'EFrU   c                :    | j                  t        | |             y rG   )r   r>  r  s     rS   codegen_dynamic_scalarz+PythonWrapperCodegen.codegen_dynamic_scalarl  s    (t45rU   c                   d |j                   D        \  }t        |j                        dk(  r#| j                  |j                   d| d       nkt        |j                        dk(  r@t        |j                  d   t              r#| j                  |j                   d| d       nt        |j                        dk(  rt        |j                  d   t              r| j                  |j                   d| d       | j                  d	|j                   d
|j                  d   j                   d|j                   d|j                  d   j                   d	       | j                  |j                   d|j                   d|j                  d   j                          nt        d|j                         | j                  |j                          d       y )Nc              3  <   K   | ]  }|j                           y wrG   r[  r\  s     rS   ry   z?PythonWrapperCodegen._codegen_dynamic_scalar.<locals>.<genexpr>p  s     >Q1&&(>r^  r   r   .item()r4   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath  = None)r`  r   keypathr   r  r_   r   r   divisorr  rJ   )r   rP   rh   s      rS   r@  z,PythonWrapperCodegen._codegen_dynamic_scalaro  s   >$++>t||!NNdhhZs4&89!#
4<<?M(RNNdhhZxv^DE!#
4<<?K(PNNdhhZ}TF'BCNN$((=a1H1H0I Jxxj >t||A?V?V>WWXZ NN88*CzQ8O8O7PQ !#8!GHH 	$--/*'23rU   c           
     0     fd}fd}fd}j                  g d       j                         5  j                  dd       t        j                  j
                  j                         D ]U  \  }}j                  d|         |||j                         |j                         |j                  |j                         W t        t        j                  j                        d	kD  r^j                  d
       t        j                  j                  j                         D ]"  \  }}j                  d|         |||       $ t        j                  j                  j                         D ]  \  }}t        |t         j"                        rCt        t        j                  j$                  j&                  j)                  |d       t*              rdt        |t,        j.                        rct        t        j                  j                        d	k(  rj                  d
       j                  d|         |||j1                                t        |t         j2                        r4 ||t        j                  j$                  j5                  |d             /t        |t,        j6                        r# ||d|j                  j8                   d       l|j;                         D cg c]-  }t        j                  j$                  j5                  |d      / }	}|j=                         D cg c]-  }t        j                  j$                  j5                  |d      / }
} |||	|
|j?                         |jA                                " ddjC                  t        j                  j                  jE                                d}j                  d|        j                  d       d d d        y c c}w c c}w # 1 sw Y   y xY w)Nc                    j                  |  dj                  |       dj                  |       d| d| d
       y )Nz = rand_strided(r  
, device='	', dtype=r   )r   r   )r   r/  rc   r  r   r   r   s        rS   add_fake_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_input  sT    &(2259:"226:; <!()E7!5rU   c                2    j                  |  d|        y ry  r  )r   r  r   s     rS   add_expr_inputzFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_input  s    vS./rU   c                    |j                  |  d       y dd l}t        |t        j                        sJ j                  |  d|j                  |      d       y )NrI  r   z = pickle.loads(r   )r   pickler_   r  ScriptObjectdumps)r   r   rT  r   s      rS   add_torchbind_inputzKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input  s_    }  D6!12eU%7%7888v%5fll56I5LANOrU   )r  r  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   r  ztorch.cuda.default_generators[z].graphsafe_get_state()zcall([r  ])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r3   rH   	constantsr  r   rb   rc   r  r   r   torchbind_constantsr  r_   rp   r  rN   
var_to_valr   r   r!   r  get_real_objr   r  r  r  r,  r-  rw  rM   r   r  )r   r   rP  rR  rW  r   r   torchbind_objr   r/  rc   call_strs   ``          rS   benchmark_compiled_modulez.PythonWrapperCodegen.benchmark_compiled_module  sy   		0		P 	K	
 ]]_ E	YMM     !ww00668 e   74&!12%**,ekk	 177../!3  1+,77+F+F+L+L+N ='D- $$wtf%56'm<	=  !ww3399; (eeU\\2zGG$$//33E4@,8 eR%7%781776671<((9$$wtf%56'e.@.@.BCuzz2
 #4)9)9)C)CETV)C)WXr'8'89"89K9K8LLcd "'!1 ((221r2BE  "'!1!1!3 ((221r2BF  #((*)E(T  		!''*>*>*C*C*E FGrJH}XJ78WXKE	Y E	YfoE	Y E	Ys+   J>P42P&P:2P,BP
PPc                    t         j                  sy| j                  |       |j                  g d       |j	                         5  |j                  ddt                dg       ddd       y# 1 sw Y   yxY w)zL
        Append a benchmark harness to generated code for debugging
        N)r  r  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r    benchmark_harnessra  rZ  r   r)   r   r   s     rS   r  z*PythonWrapperCodegen.add_benchmark_harness  ss     ''&&v.@A]]_ 	X,-?-A,BB_`	 	 	s    A//A8c           
     D    | j                  t        | |||||             y r  )r   r  )r   r}  r  r  r  r  s         rS   define_kernelz"PythonWrapperCodegen.define_kernel  s*     	 !-		
rU   c                    t         j                  j                  r)|r't        j                  dd|t        j
                        }|r| dnd}d| |  d| }|S )Nz^// z# )flagsrR  r  z

r   )r    r   r   resub	MULTILINE)r}  r  r  metadata_commentbodys        rS   _format_kernel_definitionz.PythonWrapperCodegen._format_kernel_definition
	  sZ     ==11h vvgtXR\\JH.6hZr?B&'}C}ErU   c                .   t         j                  j                  rL|rJ| j                  |||      }| j                  j                  |       t        j                  j                  ry | j                  |||      }| j                  j                  |       y )N)r  )
r    r   r   rn  r  r   r3   rH   rU  r  )r   r}  r  r  r  r  rm  s          rS   r  z*PythonWrapperCodegen._define_kernel_helper	  s     ==11c11[8 2 D %%,,T2ww""--x . 
 	4 rU   c                N    | j                   j                  |j                         y rG   )r  r   r   )r   r   subgraph_codes      rS   define_subgraph_launcher_fnz0PythonWrapperCodegen.define_subgraph_launcher_fn,	  s    !!(()<)<=rU   c                2  /01234 ddl m}m}m}	 ddlm/m}
m}m}m	} ddl
m}m} |j                  }g 4i 2g 1g }14fd0d-/02fd	}|j                  D cg c]  }|j                   }}|j                  D cg c]  }|j                   s|j"                   }}t%        |      D ]  \  }}||v r || /|      d	
       |vr#|   }|    || /|      d	       @t'        |t(        j*                        r[t'        |t(        j,                        r'd|j.                  |j0                  j3                         fnd\  }}} || |||||             t'        |t(        j4                        r/ || |||j7                         |j3                                      t'        |t(        j8                        rO || |||j:                  j7                         |j3                         |j<                  j>                               gt'        |t@        tB        jD                  f      xr* tF        jH                  jJ                  jM                  |d      } || |||      |        tO        4d 1|jP                  D cg c]  }tS        |       c}      }|tU        jV                  tF        jH                  jY                               i 2tZ        j]                  |d      t_        41      gd}|rta        |      |d<   |rta        |      |d<   tc        |      dk(  r0|je                         } g tg        tB        jh                  |d         }!nd.3fd}"i 3|D #cg c]  }#g tg        |"|#       }}#|rtc        |      tc        |      k(  sJ g }$tk        tm        ||      d d	      D ]Q  \  }#}%|$jo                   ||%      g tg        tp        |#      g tg        tr        |#      g tg        tp        |#      d       S |	j                  |$g tg        tt        3jw                               d} g 3jy                         }!t{        |j|                        g}&tc        |      dkD  rQjw                         D ]>  }t'        |t(        j4                  t(        j8                  f      r.|&jo                  |       @ |&jo                  tu        |             |&j                  tu        |              ta        |&      }&|&| j                  v rg | j                  |&   |!S | dtc        | j                         }'t               }(t        j                  j                  r|(j                  d|'d       n|(j                  d|d       |'| d <   | j                  |j                                |(j                   |              |(j                  d!g tg        ||      d"| d#|d$       t        |      })t        j                  j                  r|)j                  d%| d&d%|' d&      })|)j                  d'd(      })|(j                  |)       tF        jH                  jY                         }*|(j                  d)|*j                   d*       t        j                  |j|                        \  }+},t        j                  |j|                        }-d+|- d,|, }.| j                  |'|(j                         |.       |'|f| j                  |&<   |'||!fS c c}w c c}w c c}w c c}#w )/Nr   )config_to_dict	FixedGridPrecomputedGridr4   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                J    j                  |       j                  |        y rG   )rg   )re  rt  arg_indices	signatures     rS   add_to_signaturezPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signatureL	  s    S!s#rU   c                   |r?t               r	 | |       |j                  v r|j                     |j                  <   y y |j                  v sJ |r>t               r |  |j                               n	 | |       d|j                  <   y |r4t               r |  |j                               d |j                  <   y  | |       y )Nr   r4   )r2   r   )	re  rt  is_constexprequals_1equals_nonerw  r  r[  r   s	        rS   add_argzGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_argP	  s    13 %S#.88v% +1*:Ichh' & xx6)))57
 )l.IJ(c2*+Ichh' 57 )l.IJ*.Ichh'$S#.rU   r  T)r  )r  stable)experimentalNN)r   api_typerr  r   )r   bufferr   )r   r  r   rd   )r  )
size_dtyperD  argdefs)rD  )r  r  r[  r   restore_valuereset_to_zeror   c                N   t        | t        j                        rdg | j                  }|s| S |j	                  t
               |D ]+  }|v rt        j                  dt                     |<   - t        |       S t        | t              sJ t        j                  |       S )N)r   _launcher_s)r_   rp   r   r  sortr   r  r   r1   r   rq   )r  symbolsr  extra_launcher_argss      rS   rename_sizes_for_launcherzYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcher	  s    dEJJ/2 1 12G"#LLSL)& "55$38<<)#.A*B)CD4+C0 &d,?@@!$,,,}}T**rU   c                2    t        | d   j                        S r   r   r   s    rS   r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>	  s    3qt{{3C rU   r   )r    pythoncpppython_slow)	grid_typeprecomputed_gridsr  r  zasync_compile.triton(z, '''r}  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   r  z'''z\'\'\'z''', device_str='r  z# Original path: rT  )FFF)r  r   r   r   )Pruntime.triton_heuristicsrt  ru  rv  commonrw  rx  ry  rz  r{  r   r|  r}  r   paramsr   r  numr  r_   r!   TMADescriptorrv  rr  ri  rM   r`   rJ   r$   rh   r  rd   r   rp   rq   r3   rH   rN   statically_known_equalsr>   	arg_namesr5   r&   r  get_current_device_or_throwrK  fromkeysr<   r   r   setup_grid_as_argsr~  sympifyr   r   rg   r  r;   r   rd  r  idr   extendr  r+   r    unique_user_kernel_namesr   updateinductor_meta_commonr   r   replacer}   inspectgetsourcelinesgetsourcefilerf  r   )5r   r   r   r   restore_value_argsreset_to_zero_argsr   rt  ru  rv  rx  ry  rz  r{  r|  r}  original_nameequal_to_1_argsr  pr  
constexprsre  r   rt  r  rr  r   r  r   triton_signaturer  inductor_metaextra_launcher_call_argsr  r   r  cfg	cache_keyr   r   
kernel_srccurrent_devicer  linenosrcfiler  rw  r  r  r[  r  r  s5      `                                           @@@@@@rS   !define_user_defined_triton_kernelz6PythonWrapperCodegen.define_user_defined_triton_kernel/	  s   	
 	

	
 	
 	D)+	$&	!#%'	$"	/ "	/H &,]]3QVV3	3%+]]EannaeeE
E!), 9	GHCj \s3$G& +Cc{"\s3Fc2#3#34 &c2+A+AB "3??CJJ4H4H4JK9 1Hk5
 (!$%-(3"'	  RYY/!!$#&<<>"%--/  R%7%78 !!$#&88#4#4#6"%--/#&::#4#4	  *c5==1   ''**BB  Cc!2XFs9	Gv -)/)9)9:AWQZ:	
 *&--agg.Q.Q.ST--3
 ''
, +01C+DK(+01C+DK(u:?,5,H,H,JM'FU]]E!H)E'F$+  EGINO<s4d;<OEOSZ3w<777 "#E7#)CT 
	c "(("0"5"5Ct$4"52UD!12':UD)9':	
 -55%6'PS2E2L2L2N)O'PM
 (E)<)A)A)C'D$ VYY-	w<!}} *!#		23E3E'FG$$S)* 	[)*]+,)$	666//	:( 
  #d&D&D"E!FG(*==11%%(=dXU&KL%%(=m=Ne&TU'+m$\>>@A8:;83~w78; <,/ 0(O ,			
 OvV
==11#++d=/,CtD6QR^TJ''{;
z*<<>!!$5n6I6I5J""MN**6995	6''		2&wiq9$$&	
 6:;4G&&y1[":::O 4E@ ;j Ps   \<\
\
\%\c                    | d|j                    d}||d| z  }t        j                  |dd      }t        ||j                        }|dk(  }|s*| j                  t        | |t        j                               |S )Nr  r  T)
is_integeris_positiver  )	r  rp   r  r   r  r   rs  r3   rH   )r   r}  treer  sym_namer  rt  is_benchmark_kernels           rS   generate_numel_exprz(PythonWrapperCodegen.generate_numel_expr=
  s    !]!DKK=6!F8$Hll8$G c4::.)R/"NN.tS!''BC
rU   c                j    | j                  |j                   dt        |j                                y ry  )r   r   r  r   )r   rt  rH   s      rS   rv  z7PythonWrapperCodegen._generate_symbolic_call_arg_helperR
  s)     	#))Ccnn(='>?@rU   c                   |j                         }t        | |      }|j                  t        j                  k(  r| j                  |       n1|j                  t        j                  k(  r2| j                  |       | j                  | j                  |             n|j                  t        j                  k(  r| j                  j                  |      }|rRt        |t              rt        |j                  t              sJ t        j                  |j                  |      |_        nV| j                  |       | j                  | j                  |             || j                  |<   nt        |j                        t         j"                  j$                  r| j&                  j                  t(        j+                  | ||j,                  |j.                  t0        j2                  j4                  j7                  |j8                        fd             |j                  t        j                  k7  r0| j&                  j                  t(        j                  | |             y y y )N)r4   )r/  rc   )rJ   r  	zero_moder:   UNINITIALIZEDr   ZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr  r   r_   rP   r9   maximumr  r    r   r   r   r   make_allocationr  r   r3   rH   rN   r  r   )r   wsr   r   priors        rS   generate_workspace_allocationz2PythonWrapperCodegen.generate_workspace_allocationW
  s   {{}D"%<<,:::NN4 \\.;;;NN4 NN40067\\.===--11$7E!%6:JJ<   *11%**bA
t$t44T:;26))$/ ..==11&&00$44IIHH77++55bhh?A 5 	 ||0>>>**44(99$E ? 2rU   c                v    |j                   t        j                  k7  r| j                  t	        | |             y y rG   )r  r:   r  r   r  )r   r  s     rS   generate_workspace_deallocationz4PythonWrapperCodegen.generate_workspace_deallocation~
  s.    <<,;;;NN.tR89 <rU   c                $    | d| j                    S )Nz.zero_())rR  )r   r   s     rS   r  z%PythonWrapperCodegen.make_zero_buffer
  s    x}--rU   c                H    | ddj                  |       d| j                   S )Nr  r  r   )r   rR  )r   r   r  s      rS   r  z%PythonWrapperCodegen.wrap_kernel_call
  s'    q9-.a}==rU   c                    | j                   j                  d       | j                   j                  dt        j                  j                   d       |j                  | j                   j                                y )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)r  r   r3   rH   graph_idr  r   )r   r  s     rS   r  z8PythonWrapperCodegen.generate_profiler_mark_wrapper_call
  sb    ##$PQ##*177+;+;*<<UV	
 	D--4467rU   c                :    | j                   j                  d       y )Nzstart_graph())r  r   r   s    rS   r  z)PythonWrapperCodegen.generate_start_graph
  s    ##O4rU   c                ^    | j                   j                  dt        j                  d       y )Nz
end_graph(r   )r  r   r    profile_bandwidth_outputr   s    rS   r  z'PythonWrapperCodegen.generate_end_graph
  s'    ##j1P1P0SST$UVrU   c                ^    | j                   j                  dt        j                   d       y )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            r  r   r%   r   r   s    rS   r  z6PythonWrapperCodegen.generate_reset_kernel_saved_flags
  s2      ''8'A'A&B C	
rU   c                ^    | j                   j                  dt        j                   d       y)a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   s    rS   r  z5PythonWrapperCodegen.generate_save_uncompiled_kernels
  s4     	  ''8'A'A&B C
	
rU   c                >    d }|D cg c]
  } ||       c}S c c}w )Nc                    t        | t              rt        |       r| dz   S | S t        | t        t        t
        t        f      rt        |       S t        t        j                  j                  j                  |             S )NrF  )r_   r   r=   r   floatr   r   r  r3   rH   rN   rO   )rt  s    rS   wrap_argzAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_arg
  s^    #s#*B3*GsYPSPC#udO!DE3xQWW--66s;<<rU   rv   )r   r  r  rt  s       rS   prepare_triton_kernel_callz/PythonWrapperCodegen.prepare_triton_kernel_call
  s!    	= *33#333s   c                &    t        |t              rt        |t        j                        r.|j	                         j                         } j                  |   }n\ j                  j                  |      r|} j                  |   }n/|J d       d j                   }|} xj                  dz  c_        |
J d|        t        d |j                         D              }t        d t        j                  j                  |      D              }t        d |j                         D              }|j                         }	|j!                         }
t        j                  j"                  j%                  |j'                         j(                  t*        j,                        }d	| d
| d|	 d|
 d
| d
| d} j.                  j1                  | d|        t        |t        j                        r5 j3                  |d      }|} j.                  j1                  | d|        |S t5        |t6        j8                        st        |t:              rt        |t<              r| j>                  v r|S |y|}t        |t:              r|j@                  }|t        j                  j"                  jB                  v r't        j                  j"                  jB                  |   }t=        t        j                  j"                  jE                  |t*        j,                              S t        |t<        tF        tH        tJ        f      rt=        |      S t        |tL              rdd
jO                   fd|D               dS tQ        dtS        |             )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timetmp_arg_r4   z Failed to find a buffer for arg c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  Nr3   rH   rN   r[  r    unbacked_symint_fallbackrw   r  s     rS   ry   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>
  s@      
 	   ;;#<< <    AAc              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  r  r  s     rS   ry   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>
  s@      $
 	   ;;#<< < $r  c              3     K   | ]=  }t         j                  j                  j                  |t        j
                          ? ywr  r  r  s     rS   ry   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>
  s@      
 	   ;;#<< < r  r  zgenerate_example_value(r  z, 'z', r   r   T)rl  rm  r  rG  c              3  T   K   | ]  }j                  |t        |             ! y wrG   r{   )rw   ar   s     rS   ry   zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>  s#      ZQR!@!@DG!L Zr   rH  zUnsupported type )*r_   torch_dtyper!   r  
get_tensorrJ   r  r   r  r   r,  r3   rH   get_allocation_sizer-  rw  rM   rN   r  ra   rd   r    r  r   r   rw  rI  rp   Basicr   r   r  r   r  r[  r   r  r   r  r   r  r}   )r   rt  arg_typeraw_argr  r  rb   allocation_sizerc   r  r   rd   r   s   `            rS   r|   z/PythonWrapperCodegen.generate_example_arg_value
  sF   h,'2#3#34"--/88:**3/%%))#.**3/* X* &d&F&F%GH00A50?L&Fse$LL? 
  D $ $
 44S9$ O  
 ) F ^^%FMMOEWW%%// ''88 0 F .dV2fXSE7RTU[T\\^_n^oopqE&&00H:S1HI'2#3#34 :: %) ;  **44zUG5LMO%++.*S/2R#s#$//)J?!#/nnagg&&CCCgg&&CCCH  ;;&"A"A <   c3t45s8OT"tyy ZVY ZZ[[\]]%(9$s)&EFFrU   c                z     t        |t              r ddj                   fd|D              z   dz   S t        |      S )NrG  r  c              3  @   K   | ]  }j                  |        y wrG   )_grid_dim_str)rw   rr   r   s     rS   ry   z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>  s     RT 2 24 8Rs   rH  )r_   r  r   r  )r   grid_per_dims   ` rS   r  z"PythonWrapperCodegen._grid_dim_str  s<    lD)diiR\RRRUXX &&rU   )r  r   r  r  r  r  r   c               z   | j                   j                  |D 
ci c]2  }
t        |
t              r |
t        j
                  j                  |
      4 c}
       |xs t        j
                  j                         }| j                  t        | ||||||||t        j
                  j                  |	             yc c}
w )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        )
r}  r  r  r  r  r   r  r  r  r   N)r  r  r_   r   r3   rH   try_get_bufferr  r   r  r   )r   r}  r  r  r   r  r  r  r  r   rt  s              rS   generate_kernel_callz)PythonWrapperCodegen.generate_kernel_call   s    , 	## %c3' QWW++C00	
 @177>>@'#!!#'77<<%9!	
s   7B8r  )r  r   r  r  r  r  r  r   c          
     "
    |xs t         j                  j                         }|s|j                  dk7  r~|j                  dk(  r" j	                   j                  |             y |j                  dk(  r% j	                   j                   d|             y t        d|j                   d       j                  |      }dj                  |      }t        j                   |j                  |	      }|s$d| d	} j	                   d
 d| d| d	       y  j                          t        j                  j                  rN j                   vr?|t#        |      t#        |      k(  sJ d       d |
rDt         j                  j$                  r*t         j                  j$                  j'                  |
d       d fd} fd}g }|(|J d       d gt#        |      z  }d gt#        |      z  }nt#        |      t#        |      k(  sJ d       i }t)        t+        ||||            D ]P  \  }\  }}}}d }t-        |t.              r!dt/        |      v r|j1                  d      \  }}d }r|v r j3                  |         }|rB|}t-        |t4              st7        |t8        j:                        st-        |t<              r|||<   n|dk(  r |||||      r||   }nt-        |t4              r_t?        j@                  d|      r|}n4| jB                  vr jE                  |||      }n jB                  |   d   }|f jB                  |<   n jE                  |||      }|jG                  ||n| d|        S  jH                  j	                  dt         j                  jJ                  jM                  |j                         d        jH                  jO                           jH                  j	                   ddj                  |       d| d	        jH                  jQ                           jH                  j	                  tS        d|d              j                   jU                         t         j                  jV                  ry t         j                  jX                  jZ                  }|j]                  ||d        |5   j	                   d| d| d	       d d d         j                          y # 1 sw Y   xY w)Ncudar  mpsz.generated_kernelzdevice z nyir  z	c_void_p(r   r   r  z$call_args and arg_types do not matchc                     j                   j                         D  cg c]  \  } }|k(  r|  }} }|rddj                  |       dS yc c}} w )a  After all the autotune kernel calls have been written (i.e.
                self.kernel_autotune_example_args is complete), returns a deletion call
                for all autotune example tensors that are unnecessary after kernel_name
                is called.del r  rR  r  )r  rd  r   )ri  kntensors_to_deleter}  r   s      rS   get_autotune_deletion_callzUPythonWrapperCodegen._generate_kernel_call_helper.<locals>.get_autotune_deletion_call  se     '+&G&G&N&N&P%"[( %! %
 %!$)),=">!?rBB%s   Ac                j   ||   }||v ryt        t        | |            D ]  \  }\  }}||k(  st        |t              sd}r|v rj	                  |         }|dk(  rA	 |j                         }	t        |	j                        D ]  \  }
}||k(  s| d|
 d||<     y  y# t        $ r Y w xY w)zWe try to infer raw_arg (i.e. raw_args[idx]) from remaining raw_args.
                This is particularly useful for jagged cases, where the dimension is often
                being passed in as an input.Tr  z.shape[rH  F)r  r   r_   r#   r  ra   rb   r  )r  r  re  reused_args
target_argrN  raw_keyr  triton_inputr  rc  r  autotune_argsr   s               rS   infer_arg_by_inputszNPythonWrapperCodegen._generate_kernel_call_helper.<locals>.infer_arg_by_inputs  s    
 &c]
,-6s8X7N-O !)A)Cxz'6'B #%L$M)A'+'E'E)'2( $r) 	!!(!3!3!5&/&< ,FC J=IN'RUQVVW:XJ 7'+,!, 	 / ! !!s   #0B&B&"B&&	B21B2zkeys are not None but args arez#call_args and raw_args do not matchr  r  z^(workspace|semaphore)r   rS  rT  z.run(z	, stream=z
<del_call>r  )/r3   rH   r  r}   r   r  r  r  r   r   r  r  r  r    r   r   r   r   autotuning_mappingr   r  r   r_   r   splitr  r  rI  rp   r  r   ri  matchr  r|   rg   r   rW  rZ  r'  rI  r(   r   rU  rR  r  rS  )r   r}  r  r  r   r  r  r  r  r  r   call_args_strstream_name
stream_ptrr  r  all_argsr  rN  rt  r  r
  r  r   r  arg_strrW  r  s   ``                         @rS   r  z1PythonWrapperCodegen._generate_kernel_call_helperT  s    @177>>@&++/{{e#t44[)LM  %))[M9J*KYW
  #WV[[M#>?? 77	B		-0*??&,,

 $[M3JNN-qQ}oR
|1M %%' MM224#=#== (S^s9~-M 6M !M#(B(B ! : : > >($!B H'I)II' 6C	N2 6C	N28}I6 96 K8AIy(H=9 *P44C7G c3'C3s8O"yy~HC.2 W%=#'#A#A%g.$L  *G%h<"8U[[9%c?;/6G,]':h;(
 *'2G+6 xx 93?"%D$E$EE"&"A"A7# #'"C"CC"H"K>E{=SD55c:"==c8WUG3;se1WI<NOU*PZ &&00**77EFaH &&002&&00-uTYYx%8$9;-qQ &&224&&00 /I<X &&**;7ww"" !" 4 4 B B..y+yRVW" 	XNNk]%i}TUVW	X%%'	X 	Xs   TTc                :    | j                   j                  |       y rG   )r  rg   r   r   s     rS   r   zPythonWrapperCodegen.writeline  s    

$rU   c                4    |D ]  }| j                  |        y rG   r  )r   r  r   s      rS   rZ  zPythonWrapperCodegen.writelines  s     	!DNN4 	!rU   c                L    | j                   j                  t        |             y rG   )r  rg   r.   )r   ctxs     rS   r  z"PythonWrapperCodegen.enter_context  s    

+c*+rU   c                    ddl m}  |       rdd l}t        |t              rt        |j                  j                        S t        |t        j                        rt        |      S t        |t        t        f      rAt        j                   G d d             t         t        |       fd|D                    S t        |t         j"                  j$                        rt'        |      S t        |t(        j*                  t(        j,                  t.        f      r|j1                         S  |       r+t        |j2                  j4                        rt        |      S t        |t(        j6                        r|j1                         S t9        t        |            r=t        |      }|t:        j<                  j>                  |j@                  <   t        |      S t        |      S )Nr   )has_triton_packagec                      e Zd ZU ded<   d Zy)1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                    | j                   S rG   )r  r   s    rS   __repr__z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__&  s    88OrU   N)r   r   r   r   r!  rv   rU   rS   Shimr  "  s    $rU   r"  c              3  V   K   | ]   } t         j                  |             " y wrG   r`  )rw   r  r"  r   s     rS   ry   z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>+  s$     Vq1@@qIJVs   &))!torch.utils._tritonr  r   r_   r   r  rP   r  rp   r   r   r  r  	dataclassr  r}   r  _ops
OpOverloadr   r!   r`   
MutableBoxr$   rz  languager   r  r   r3   rH   opaque_value_type_classesr   )r   r  type_r  r   opaque_typer"  s   `     @rS   ra  z#PythonWrapperCodegen.val_to_arg_str  s]   :a"%%5::&8OE4=)""$ $ #$ QVTUVV  5::001&q))BIIr}}oFG&&((!jFOO4I4I&J7N2,,-&&((!$q'*q'KFQAGG--k.B.BC7N7NrU   c           	     `   |j                         }|j                         }t        |j                               }t        t        j
                  j                  |            }t        |j                               }|j                         }| j                  |j                         ||||||      S rG   )rw  rM   r   r,  r3   rH   r  r-  get_is_pinnedr  rJ   )r   r  r  r   r/  allocation_shaperc   	is_pinneds           rS   r  z+PythonWrapperCodegen.make_buffer_allocation=  s    ""$  "foo'( !<!<V!DEv((*+((*	##OOvueV=My
 	
rU   c                x    d}t         j                  j                  s| j                  j	                  |d       y y )Nzi
            from torch._inductor.runtime.debug_utils import check_memory_step, track_tensor
            Tr   )r3   rH   rU  r  r   r  s     rS   "write_memory_track_allocation_oncez7PythonWrapperCodegen.write_memory_track_allocation_onceH  s4    
 ww""LL
$7 #rU   c                   ||}| j                  |      }| j                  |      }	| j                  |      }
t        j                  j                  j                  j
                  r| d|	 d|
 d| d|j                   d| d}ne|j                  dk(  r|r| d|	 d|
 d| d	}nE|j                  d
v r| d|j                   d|	 d|
 d| d	
}n| d|	 d|
 d|j                   d| d	
}||	k7  r|d| d|
 d	z   }|S )Nz = tracked_empty_strided(r  z, dtype=rN  z	', name='r  r  z = empty_strided_cpu_pinned(r   )r  r   xpumtiaz = empty_strided_r  z = empty_strided(rO  z.as_strided()r   r  r  r    r  r  r}   )r   r   r  r   r/  rc   r/  r0  r3  codegen_allocation_shape_tuplecodegen_stride_tuplerT  s               rS   r  z$PythonWrapperCodegen.make_allocationP  s    #$"==eD)-)H)H*
&  $>>vF??!!..EE&112"'( )  !;;- (b"  [[E!i&412"'('  [[:: &)&++a12"'('  &)12"'( )!;;-yq:  "@@,':&;2>R=SSTUUC
rU   c                8    | j                  t        |             y rG   )r   r9  r  s     rS   make_commentz!PythonWrapperCodegen.make_comment  s    {4()rU   c           	     `    | j                    | d| | j                   d| j                   d| 	S )Nr      )rQ  rR  r  )r   new_nameold_namer  s       rS   make_tensor_aliasz&PythonWrapperCodegen.make_tensor_alias  s6    ,,zXJt{{m2dll^STU\T]^^rU   c                (    d|j                          S )Nr  )rJ   )r   r  s     rS   r  z%PythonWrapperCodegen.make_buffer_free  s    foo'())rU   c                8    ddj                  d |D               S )Nr  r  c              3      K   | ]  }|  y wrG   rv   )rw   r   s     rS   ry   z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>  s     >>s   )r   )r   names_to_dels     rS   make_free_by_namesz'PythonWrapperCodegen.make_free_by_names  s    dii>>>?@@rU   c           	     `    | j                    | d| | | j                   d| j                   d	S )Nr   r;   reuse)r  rR  r  )r   r>  r=  del_lines       rS   codegen_exact_buffer_reusez/PythonWrapperCodegen.codegen_exact_buffer_reuse  s@    ../zXJxjQUQ\Q\P]]_`d`l`l_mmsttrU   c                P    |$| j                  | j                   d| d|        y y )Nz [Provenance debug handles] rT  )r   r  )r   r}  debug_handles      rS   write_provenance_debug_handlez2PythonWrapperCodegen.write_provenance_debug_handle  s4    
 #NN<<. <[M<.Y $rU   c                r   |j                         |j                         k(  sJ |j                         }|j                         }d}|t        j                  j	                         vr|rd| j                  |       }|j                         |j                         k(  r4|j                         |j                         k(  r| j                  |||      S | j                  ||j                         |j                         d| j                  j                        }| j                   | d| | d| j                   dS )N;z; r   r   r;  rF  )rM   rJ   r3   rH   r  r  r,  r-  rH  r  r  r   rQ  r  )r   r  r  r  r>  r=  rG  reinterpret_views           rS   r  z&PythonWrapperCodegen.make_buffer_reuse  s   }}#--/111<<><<>1773355*D11#678H<<>S\\^+0@CNNDT0T228XxPP88!11d6G6G6Q6Q
 ,,z-=,>xj4<<.X^__rU   c                    | j                  t        || j                   | d|j                          | j                   d| j
                   d             y )Nr   r;  z alias)r   r7   rQ  rz  rR  r  )r   r   r  s      rS   r	  z0PythonWrapperCodegen.codegen_deferred_allocation  sS    <<.c$*@*@*B)CDKK=PRSWS_S_R``fg	
rU   c                p   |j                         }|t        j                  j                  v sG|| j                  v s9t        |t        j                  t        j                  t        j                  f      ry | j                  j                  |       t        |j                         t        j                  t        j                  f      r|j                         sy |j                         }t        |t        j                         ry t        |t        j"                        ry t        |t        j$                        r+t        |j&                  t        j(                        s*J dt+        |j&                         d|j&                          |j&                  j,                  }t        |t        j.                        sJ t+        |             |j,                  }t        |t        j0                  t        j(                  f      sJ t+        |             t        |t        j(                        rdfd |      }| j3                  |       | j5                  t7        | |||             y t        |t        j8                        r| j5                  t;        | |             y | j5                  t=        | |             y )Nzunexpected r   c                   t        | t        j                        r | j                               S t        | t        j                        r | j
                        S t        | t        j                        sJ t        |              | S rG   )r_   r!   BaseViewunwrap_viewr(  rh   r`   r}   )targetunwrap_viewss    rS   rU  z=PythonWrapperCodegen.codegen_allocation.<locals>.unwrap_views  sd    !&"++6+F,>,>,@AA!&"--8+FKK88%fbii8F$v,F8!MrU   )r   r  )rJ   r3   rH   r  r  r_   r!   DonatedBufferSubgraphBufferInputBufferr   get_defining_opExternKernelAllocMultiOutputshould_allocater!  MutationLayoutSHOULDREMOVErG  r  r  r$   r}   rh   rf   r`   codegen_allocationr   r  r"  r*  r  )r   r  r   r  boxinput_bufferrU  s         @rS   r^  z'PythonWrapperCodegen.codegen_allocation  s
     AGG+++t~~%&2#3#3R5F5F"WX4 &&(%%r~~6 **,'')fb;;<fbmm,fb001fkk2+=+=> d6;;/06;;-@> ++""Cc2==1<49<188LlRYY8J8J,KL dO L ,(:(:;"  ,L9##L1NN?4vvNOfb112NN1$?@|D&12rU   c                   |j                         }t        |t        j                  t        j                  f      r| j                  t        | |             y t        |j                         t        j                        r| j                  t        | |             y | j                  |      sy | j                  j                  |       | j                  t        | |             y rG   )rJ   r_   r!   rX  r  r   r  r!  r"  r:  	can_reuser  r   r  )r   r  r   s      rS   codegen_freez!PythonWrapperCodegen.codegen_free  s      fr~~r/A/ABCNN8D&12f,,.0C0CD NN-dF;<~~f%

t*489rU   c                   |j                         }|t        j                  j                  v xs |t        j                  j                  v xr6 t        t        j                  j                  |   t        j                         xsh |t        j                  j                  v xsJ |t        j                  j                  v xs, |t        j                  j                  v xs || j                  v  S rG   )rJ   r3   rH   r  r  r_   graph_inputs_originalr!   rV  r[  r\  never_reuse_buffersr  )r   r`  output_bufferr   s       rS   rb  zPythonWrapperCodegen.can_reuse  s    $$&AGG+++ 
",,, "GG11$79I9I 
" qww(((
" qww222
" qww222
" tzz!
 	
rU   c                    |j                         | j                  v xr. | j                  |j                            |j                         k(  S rG   )rJ   r  )r   r  reused_buffers      rS   	did_reusezPythonWrapperCodegen.did_reuse  sC     OO, KFOO-.-2H2H2JJ	
rU   c                t   t        ||      sJ | j                  |       | j                  j                  |j	                                | j
                  j                  |j	                                |j	                         | j                  |j	                         <   | j                  t        | ||             y rG   )	r]   r^  r  r   rJ   r  r  r   r  )r   r`  rg  s      rS   codegen_inplace_reusez*PythonWrapperCodegen.codegen_inplace_reuse  s    $\=AAA-

|,,./=11340<0E0E0GM**,-y|]CDrU   c                    t        |      }|| j                  v r|S | j                  j                  |       | j                  |z   S rG   )r   r  r   rQ  )r   r   r   s      rS   codegen_unbacked_symbol_declz1PythonWrapperCodegen.codegen_unbacked_symbol_decl  sC    6{4---K &&**40<<$&&rU   c                    t        t        j                  j                  j                  |      }| j                  t        | |||             y rG   )r   r3   rH   rN   	shape_envr   rz  )r   r{  r|  r~  s       rS   (codegen_unbacked_symbol_defs_for_outputsz=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs%  sB     6GG&&(9
 	"4g?PQ	
rU   c                    |sy |j                         D ]I  \  }dfdfd}| j                  | j                  |       d |        | j                          K y )Nc                   |dk(  r| S t        |      dk\  r_t        |d   t              rLt        |d   t        j                        r/ |  d|d   j
                   d|d   j                   d|dd        S t        |d   t              r |  d|d   j
                   d|dd        S t        |d   t        j                        rYt        j                  j                  r  d	|d   j                   d
|  d|dd        S  |  d|d   j                   d|dd        S t        |d   t              r |  d|d   j                   d|dd        S t        d|       )Nrv   r   r   r4   r   r  r   r  z	std::get<z>(rG  rH  z.__floordiv__(rH  )r   r_   r   pytreeSequenceKeyr   re  r3   rH   rU  r   rK  r  )r  rJ  gos     rS   rv  zJPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.goE  s   b=K LA%"71:}="71:v/A/AB&'!*//!2!GAJNN3C1Ewqr{   
M:a
'8;WQR[II
F,>,>? 77.. Ywqz~~&6ba@'!"+N  4&'!*..)9 ;WQR[I
  
K8 nWQZ5G5G4HJGTUTVKXX(+@	)JKKrU   c                    t         j                  j                  rt              dk(  rZd   }  d   j	                         t        | t        j                        r!t        | j                        dk7  r	dd        S       S t        d   t        j                        sJ  d   j                     j	                         dd        S        S )Nr4   r   )r3   rH   rU  r   rJ   r_   r!   r[  rD  rt  ru  re  )rT  rv  rJ  r{  r|  s    rS   go_outerzPPythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outerc  s    77&&
 7|q(%aj  "#AJ//1)#r~~>3s{{CSWXCX $ABK   ")	    *'!*f6H6HIII!''!*.."9"B"B"DgabkRRk733rU   r   )r  r   rJ  zpytree.KeyPath)r  r   rn  rR  )r   r{  r|  r~  r  rx  rv  rJ  s    ``   @@rS   r  z>PythonWrapperCodegen._codegen_unbacked_symbol_defs_for_outputs2  sg     ! ,113 <	JAw
L<4. NN44Q78HJ<}Uu<	rU   c                     fd} fd}	  j                  j                          j                   j                   dj                           |        t
        j                  }t        j                  j                        5  j                  j                  |       d d d         |         j                          y # 1 sw Y   !xY w#  j                          w xY w)Nc                    t        j                  j                        t              k(  sJ t        j                  j                        D ]3  \  } }j	                  j
                   |  d| j                          5 y ry  )r   rH   r  r   r   rQ  rR  )inner_inputouter_inputouter_inputsr   subgraphs     rS   _codegen_subgraph_prefixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefix  sy    x~~223s<7HHHH,/++\- ([ ||n[M[M$++OrU   c                    t        j                  j                        t              k(  sJ t        j                  j                        D ]5  \  } }j	                  | d| j                          j                          7 y ry  )r   rH   r  r   r   rz  rR  )inner_outputouter_outputouter_outputsr   r~  s     rS   _codegen_subgraph_suffixzSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix  s{    x~~334M8JJJJ.1,,m/ *l #nC(F(F(H'I$++WrU    subgraph: )parent_graph)	r&  rH   r   r  r   r3   set_graph_handlercodegen_subgraphrH  )r   r~  r}  r  r  r  r  s   ````   rS   codegen_subgraph_by_inliningz1PythonWrapperCodegen.codegen_subgraph_by_inlining~  s    			'%%hnn5NNdll^;x}}oFG$&77L$$X^^4 //!- 0  %&$$&  $$&s$   A;C C,C CC C*c           	        |j                   }|j                  }t        |j                               |j                  D cg c]  }|j
                   c}z   }dj                  |      t        |      dk(  rdndz   }|D cg c]  }|j                          }	}dj                  |	      t        |      dk(  rdndz   }
| j                  d| d| d       |j                         D cg c]
  \  }}|s	| }}}|r#| j                  ddj                  |              | j                  d	|
 d
| d| d       | j                  d| d       yc c}w c c}w c c}}w )z'Generate code to call a graph partitionr  r4   r  r  	partition	_args = [rH  r  r  z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesr  r  symbol_inputsr   r   r   rJ   r   r  )r   partition_idr  r  r  symbol_inputr  r`  rP   output_namesr|  r   
deallocaterC  s                 rS   codegen_partition_callz+PythonWrapperCodegen.codegen_partition_call  sj    2DD+88-22452F2T2T9
".L9
 
 ;'#k2Ba2G3RP4@ADAA))L)C4E4JSPRS 	<.	&CD *<)A)A)C
%T:zD
 
 NNT$))L"9!:;< 	y,\N+l^SYZ	
 	|nE:;-9
 B
s   E?E'
E2Ec                P    t        |      D cg c]  }d| 	 c}| _        y c c}w )N
partition_)r  r6  )r   num_partitionsre  s      rS   set_all_partition_namesz,PythonWrapperCodegen.set_all_partition_names  s$    BGBW#X3j$6#X #Xs   #c           	     p   dj                  |      t        |      dk(  rdndz   }dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       | j                  d| d|j                  j                   d|j                  j                   d	       y )
Nr  r4   r  r  r  rH  r  z) = r  )r   r   r   rH   r   )r   r~  r}  outer_flattened_outputsouter_output_namesouter_input_namess         rS   ,codegen_subgraph_call_with_flattened_outputszAPythonWrapperCodegen.codegen_subgraph_call_with_flattened_outputs  s     "YY'>?./14C"
 !IIl3|$)Cr
 	(..--.i8I7J!LM 	"#4(;(;'<Ahnn>Q>Q=RRXY	
rU   c                v   dj                  |      t        |      dk(  rdndz   }| j                  |j                  j                   d| d       t
        j                  j                  j                          | j                  | d|j                  j                   d|j                  j                   d	       y )
Nr  r4   r  r  r  rH  r   r  r  )r   r   r   rH   r   r3   r  free_buffers)r   r~  r}  outer_buffer_namer  s        rS   codegen_subgraph_callz*PythonWrapperCodegen.codegen_subgraph_call  s     IIl3|$)Cr
 	(..--.i8I7J!LM 	
&&( 	 !X^^%8%8$98>>;N;N:OvV	
rU   c                    | j                  |j                         | j                  d       | j                  | j                   d|j                          t
        j                  }|j                  |j                  _        |j                  |j                  _        |j                  j                  | j                  vrt        j                  |j                        5  t        j                  dd      5  |j                  j                         \  }}d d d        d d d        |j                  j                  }| j                  j                  |       | j                  |       y y # 1 sw Y   VxY w# 1 sw Y   ZxY w)Nr  r  r	  F)r&  rH   r9  r  r   r3   rU  
fx_wrapperr  r  r    patchr*  r   rr  )r   r~  r  rq  r  r  s         rS   codegen_subgraph_commonz,PythonWrapperCodegen.codegen_subgraph_common  s/   !!(..1"T\\N+hmm_EFww%1%=%="$0$;$;!>>d&F&FF $$X^^4 @\\"3U; @'/~~'='='?$M1@@
 %NN//M,,00?,,]MJ G
@ @@ @s$   E45E(E4(E1	-E44E=c                L    | j                  |       | j                  |||       y rG   )r  r  )r   r~  r}  r  s       rS   'codegen_subgraph_with_flattened_outputsz<PythonWrapperCodegen.codegen_subgraph_with_flattened_outputs  s(     	$$X.99l$;	
rU   c                L    | j                  |       | j                  |||       y rG   )r  r  )r   r~  r}  r  s       rS   r  z%PythonWrapperCodegen.codegen_subgraph  s%     	$$X.""8\;LMrU   c                   |j                         }| j                  | dt        |j                                |j                  D cg c]  }|j                          }}t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j                  |j                  ||       y | j                  |j                  ||       y c c}w c c}w )N = [None] * rG  rH  )rJ   r   r   r|  r`  rz  r3   rH   rV  r  r  r~  r  )r   invoke_subgraphr   r  r}  rN  r  s          rS   codegen_invoke_subgraphz,PythonWrapperCodegen.codegen_invoke_subgraph  s    '')$|C0G0G,H+IJK;J;Q;QRC--/RR77(-c/2I2I.J(K#$4&!AM  --((, !!/":":L$O Ss   C(C-c                   |j                         }|j                  D cg c]  }|j                          }}|j                  j                         }t	        |j                  t
        j                        s| d}| j                  | dt        |j                                | j                  d| d       | j                  t        | |j                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j                  ||       n| j#                  |j                  ||       | j                  t%        |              | j                  d       | j                  t        | |j&                  j                               t        j                  j                  rOt        t        |j                              D cg c]
  }| d| d }}| j!                  |j&                  ||       n| j#                  |j&                  ||       | j                  t%        |              y c c}w c c}w c c}w )NrF  r  r   rT  rG  rH  zelse:)rJ   operandsrz  	predicater_   r!   ShapeAsConstantBufferr   r   r|  r  true_subgraphrH   r3   rV  r  r  r  rD  false_subgraph)r   conditionalr   r  r}  r  rN  r  s           rS   codegen_conditionalz(PythonWrapperCodegen.codegen_conditional%  s   ##%;F;O;OPC--/PP));;=	+//1I1IJ$+W-I$|C0C0C,D+EFGYKq)*({/H/H/N/NOP775:3{?R?R;S5TUvQqc^UMU--))< !!+";";\4P'-.w({/I/I/O/OPQ775:3{?R?R;S5TUvQqc^UMU--**L- !!+"<"<lDQ'-.9 Q V Vs   I)"I.I3c                     fd}|j                         }|j                  D cg c]  }|j                          }}|j                  D cg c]  }|j                          }}t	        |      } j                  | dt	        |              |r  j                  | dt	        |       d       t        |      D ]  \  }	}
 j                  | d|	 d|
          g t        t	        |            D 	cg c]
  }	| d|	 d c}	|}| dg}t        |      }|d	t	        |       } ||j                  ||        j                  d
|d            j                  d       |rwt        |      D ]h  \  }	} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      j nvt        |      D ]h  \  }	} j                  t         |j                  j                                j                  | d|	 d| d        j                  t                      j  j                  d        j                  t         |j                  j                                ||j                  ||        j                  t                      |r j                  t         |j                  j                               t        t	        |            D ]"  }	 j                  | d|	|z    d| d|	 d       $  j                  t                       j                  t         |j                  j                                ||j                  ||        j                  t                       j                  d|d           |r j                  d       t        t	        |            D ]  }	 j                  d| d|	|z    d        j                  t         |j                  j                                j                  | d|	 d| d|	|z    d        j                  t                       y	y	c c}w c c}w c c}	w )z1while_loop is codegened as a host side while_loopc                    t         j                  j                  rj                  | ||       yj	                  | ||       y)z3Helper method to deduplicate subgraph codegen logicN)r3   rH   rV  r  r  )r~  r}  r  r   s      rS   r  zAPythonWrapperCodegen.codegen_while_loop.<locals>.codegen_subgraphI  s7    ww11(L-X<<lMrU   r  z.extend([[] for _ in range(z)])rG  z] = rH  _cond_resultNzshould_loop = r   zif not should_loop:z.unsqueeze(0).clone()rD  zwhile should_loop:z	].append(rY  z    should_loop = z%# Stack outputs after loop completionzif len(z]) > 0:z] = torch.stack(z	], dim=0))rJ   carried_inputsrz  additional_inputsr   r   r  r  r  cond_subgraphr  body_subgraphrH   rD  )r   
while_loopstack_outputr  r   r  outer_carried_inputsouter_additional_inputs
ckp_offsetrN  inpcond_outer_inputscond_outer_outputsbody_outer_inputsbody_outer_outputscarried_inputs   `               rS   codegen_while_loopz'PythonWrapperCodegen.codegen_while_loopF  s   	 ""$/9/H/H 
(+C!!# 
  
 0:/K/K#
(+C!!##
 #
 -.
$|C0D,E+FGHNN&3C8L4M3NcR   45 	3FAsNNdV1QCtC512	3
&+C0D,E&FGas!nG
$
 "&l34 
 //J5I1JK$$&79K	
 	(:1(='>?@,-$-.B$C 7 =0z7O7O7U7UVW$q4>STU/567
 %..B$C 7 =0z7O7O7U7UVW$q4hGH/567
 	+,(z/G/G/M/MNO$$&79K	
 	'-. NN,T:3K3K3Q3QRS3345 P$qZ(8	$q2NOPNN+D12 	(z/G/G/M/MNO$$&79K	
 	'-.+,>q,A+BCD NNBC3345 7aJ/?wGH0z7O7O7U7UVWfAaS 0aJ7GyQ /567 Q 
#
  Hs   Q+Q04Q5c                    	 t        | dd       ry t        | t              r| S t        j                  j
                  j                  |       }||S t        |      S # t        $ r Y y w xY w)Nr  )r  r_   r   r3   rH   
_shape_env_maybe_evaluate_staticr  )r   r  s     rS   statically_known_int_or_nonez1PythonWrapperCodegen.statically_known_int_or_none  sf    	q.$/ !S!''$$;;A>C{
s8O 		s!   A A ,A 
A 	A&%A&c                l    g }| D ],  }t         j                  |      }| y |j                  |       . |S rG   )r   r  rg   )lstr3  r   r  s       rS   %statically_known_list_of_ints_or_nonez:PythonWrapperCodegen.statically_known_list_of_ints_or_none  sA     	A&CCAFC{MM#		
 rU   c                0    t         j                  |       d uS rG   )r   r  )r  s    rS    is_statically_known_list_of_intsz5PythonWrapperCodegen.is_statically_known_list_of_ints  s     !FFsKSWW	
rU   c                H    t         j                  | j                               S rG   )r   r  r,  r  s    rS   r  z4PythonWrapperCodegen.static_shape_for_buffer_or_none  s    #IIOO
 	
rU   c                0    t         j                  |       d uS rG   )r   r  r  s    rS   !can_prove_buffer_has_static_shapez6PythonWrapperCodegen.can_prove_buffer_has_static_shape  s    #CCFKSWWWrU   c                     y rG   rv   )r   r}  node_schedules      rS   write_kernel_context_guardz/PythonWrapperCodegen.write_kernel_context_guard  s    
 	rU   c                     y)z<
        Mark the beginning of kernel context guard
        Nrv   r   s    rS    write_kernel_context_guard_beginz5PythonWrapperCodegen.write_kernel_context_guard_begin       	rU   c                     y)z6
        Mark the end of kernel context guard
        Nrv   r   s    rS   write_kernel_context_guard_endz3PythonWrapperCodegen.write_kernel_context_guard_end  r  rU   rG   )r  r   r  r   r  Optional[PythonWrapperCodegen]r  $Optional[ir.GraphPartitionSignature]r-  )r   r   r  r   r   r  )r  r   )r  TritonMetaParamsr   r   r   r  )r   z>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]r   zlist[IRNode])r  r  r'  )rN  r   r  r   r   r   r  )rN  r   r   r  )r/  r  r   r  r3  r+   r   r  )rP   zir.FallbackKernelr   r  )rP   rg  )rP   rq  r   r  )r   r   rT  r   rU  r   rl  r  r  r   rV  zOptional[OrderedSet[str]]r   r  )F)rP   ri  )rP   rX  r   r  )r  r   rp  r   r  zCallable[[], Sequence[str]]r  z<Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]r  rC  r|  zSequence[ir.Buffer]r   r  )r  Callable[..., None]r   zIterator[Callable[..., None]])r  r   )r   r   r   r  r  r  )r  r   )r   r   rO   r   r   r   )r   r   r   r   )rL  r   r   r   r  r   r   r   )r/  zSequence[Expr]r   r   )r   ztuple[str, list[str]])r   r  r   r   )r  zUnion[bool, str])rP   zir.MultiOutput)NTN)
r}  r   r  r   r  r   r  r   r  r   )r}  r   r  r   r  r   )r   r   )r   z"list[list[Union[int, sympy.Expr]]])r}  r   r  r   )rt  r   rH   rA   r   r  )r  r9   )r}  r   )r  r  )NF)r  )r  r  )rC  r  )r>  r   r=  r   rG  r   )rJ  rO  )r  r  r  r  r  r   )r   r   r  zir.ReinterpretViewr   r  r  r  )r`  r  rg  r  )r{  r   r|  r   r~  r}  r   r  )r  r   r  zir.GraphPartitionSignature)r  r   )r}  r   r  z0Union[Sequence[BaseSchedulerNode], ExternKernel])r   r   r   rU  supports_cachingr  r7  r  r  r  r  r  r  r'   r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r&  rH  r"  rF  r(  r*  r,  r0  r4  r8  r:  r<  rA  rk  rO  ry  ro  rs  rw  rz  r|  ro  r  rb  r  r  r  r   contextmanagerr  r  r  r  r  r  r  r  r  r  r  r  r  r  rJ  r   r3  r	  r  r   r#  r,  rA  rC  r@  ra  r  rf  rn  r  rr  r  r  rv  r  r  r  r  r  r  r  r  r  r  r|   r  r  r  r   rZ  r  ra  r  r2  r  r9  r?  r  rD  rH  rK  r  r	  r^  rc  rb  rj  rl  rn  rq  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  s   @rS   r   r   "  s~    ]#~ 
 FJ	&&$& 7& C	& &'<@AD8  "	B + +! 
 

$	G$
%S$	(
.8):6-.10J
/+7&5
5$:,:8 8 
8 37;; ;  	;
 ; ; 0; 
; ,U
8(B<
	V	V  	V .		V
 R	V  	V %	V 
	V0 ! !,P
d.S`:
 
D%(K(K (K -	(KT'?RD @D W CG +.&'6	, <P '<P 
<P|>S
IG264*_YB, #'(,

 
  	

 
 &
& FJ		'*	6C	 	 #'(,!! !  	!
 ! &!,>L; 2L;\*A"A+8A	A
%N:.>85W

2
4SGj' !2
2
r !v(v(p !,$N	
 8 8 TY.`*_*Au '+ $` 
53n:(
 
E'

 
 H	

 

JJ J H	J
 
JX+'Z<< 9<BY
$
"K*
NP /B]7~     
 

 
 

 X X HrU   r   c                       e Zd ZdZ	 d	 	 	 	 	 d fdZddZddZd Zd Zd Z	ddZ
dd	Zdd
ZddZ	 	 ddZddZddZd fdZedd       Zedd       Zedd       Zd Z xZS )r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    c                    || _         || _        || _        t        |           | j                         }|j                  | _        |j                  | _        |j                  | _        |j                  | _	        y rG   )
r  r  r  r   r  get_root_graphr  r   r  r  )r   r  r  r  rootr  s        rS   r  z%SubgraphPythonWrapperCodegen.__init__  sn     +,$8!""$$($=$=!%)%?%?"!//)-)G)G&rU   c                &    | j                   | _        y rG   )r  r  r   s    rS   r  z1SubgraphPythonWrapperCodegen.set_launcher_fn_name  s     !% 2 2rU   c                     y rG   rv   r   s    rS   r  z)SubgraphPythonWrapperCodegen.write_header  r  rU   c                     y rG   rv   rd  s     rS   r  z2SubgraphPythonWrapperCodegen.add_benchmark_harness  r  rU   c                     y rG   rv   rd  s     rS   ra  z6SubgraphPythonWrapperCodegen.benchmark_compiled_module  r  rU   c                     y rG   rv   r   s    rS   r  z5SubgraphPythonWrapperCodegen.write_async_compile_wait  r  rU   c                6    | j                   j                         S rG   )r  r(  r   s    rS   r(  z/SubgraphPythonWrapperCodegen.next_kernel_suffix  s    ""5577rU   c                     y rG   rv   r2  s     rS   r8  z2SubgraphPythonWrapperCodegen.generate_after_suffix  r  rU   c                \    | j                   j                  d| j                   d       d}|S )Nz
            def z(args):
            r4   )r  r   r  r
  s     rS   r  z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indent  s<    &&' (	

 rU   c                     yr   rv   r   s    rS   r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indent(  s    rU   c                    | j                   x}r3|j                  |j                  D ci c]  }t        |      | c}z  }|S t        j
                  j                  }|S c c}w rG   )r  input_nodesr  r   r3   rH   r  )r   r  r  r`  s       rS   r  z-SubgraphPythonWrapperCodegen.get_graph_inputs+  sh     11191**#,#:#:.A	. F
  WW))F.s   Ac                    | j                   x}rJt        |j                  j                               |j                  D cg c]  }|j
                   c}z   }|S t        j                  j                  }|S c c}w rG   )	r  r  r  r  r  r   r3   rH   r  )r   r  r  namess       rS   r  z2SubgraphPythonWrapperCodegen.get_graph_input_names6  sr    11191..33566?6M6M:&2!!: E
  GG--E:s   A5c                r    | j                   x}r|j                  }|S t        j                  j                  }|S rG   )r  r  r3   rH   r  )r   r  r|  s      rS   r  z.SubgraphPythonWrapperCodegen.get_graph_outputs?  s;    11191,,G  gg++GrU   c                ~    |j                         }| j                  x}r||j                  v ry t        |   |       y rG   )rJ   r  r  r   r^  )r   r  r   r  r  s       rS   r^  z/SubgraphPythonWrapperCodegen.codegen_allocationF  s?     222I2	@U@U8U "6*rU   c                8    | j                   j                          y rG   )r  r  r   s    rS   r  z5SubgraphPythonWrapperCodegen.write_triton_header_onceP  s     	446rU   c                8    | j                   j                          y rG   )r  r  r   s    rS   r  z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_onceY  s     	<<>rU   c                    | }t        |t              r|j                  }t        |t              rt        |t              sJ |S rG   )r_   r  r  r   )r   r  s     rS   r  z+SubgraphPythonWrapperCodegen.get_root_graphc  sB    DH;<&&D ;< $ 4555rU   c                     y rG   rv   r   s    rS   r  z<SubgraphPythonWrapperCodegen.generate_and_run_autotune_blockl  s    rU   rG   )r  r   r  r   r  r  r-  r  r  r'  )r   zDdict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr, None]]r  r  r  )r   r   )r   r   r   rU  r  r  r  r  ra  r  r(  r8  r  r  r  r  r  r^  r'   r  r  r  r  r  r  s   @rS   r  r    s     FJ	HH -H C	H.38		M	+ 7 7 ? ?  rU   r  )rP   r  r   r  )rY   r  rZ   r  )NN)r   r   r   zlist[triton.Config]r   zlist[TritonGrid]r~   r  r   r   r   ztuple[str, str]r  )
__future__r   r  r   r  r   r  r  r  r  r4  ri  r  collections.abcr   	itertoolsr   r   typingr   r   r	   r
   rp   r   r  
torch._opstorch.utils._pytreeutils_pytreert  r   r  torch._dynamo.utilsr   r   #torch._inductor.codegen.debug_utilsr   $torch._inductor.codegen.multi_kernelr   %torch._inductor.runtime.runtime_utilsr   torch._library.opaque_objectr   torch._loggingr   %torch.fx.experimental.symbolic_shapesr   r   r   r   r   torch.fx.noder   torch.utils._ordered_setr    torch.utils._sympy.singleton_intr   torch.utils._sympy.symbolr   r   r  r   r    r!   	codecacher"   r#   r$   r   r%   runtime.hintsr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   virtualizedr3   r  r5   r6   r7   r8   r9   r:   	cpp_utilsr;   triton_utilsr<   r=   r>   r?   r@   r   rH   rA   rB   r  rC   wrapper_fxirrD   	getLoggerr   logdoprintr  r   r  r   r   r  r`   r  r  rT   r]   rm   rK  r   r  r   r   r   r%  r   r   rE   r  r1  r9  r>  rD  rM  r`  rf  rp  r  r  r  r  r  r  r  r  r  r  r  r*  r:  r@  rW  rh  rs  rz  r  Liner   r  rv   rU   rS   <module>r     s   "    
      	  $ " 6 6     $ $ & 6 C A ; = +  . / 9 : ( ( ' ( ' ,       P P 2%!-) g! u{{C56299l*+
]OT12 >QF S> 	%UZZ
 #
%&2B1CU3PS8_1T(UU
 /3*.k&
k& k& k& ,	k&
 (k& k&\W&t   * **Y Y
 2 2 2 	/k 	/ 	/ ++ + + 	2 	2 	2 1{ 1 1 "@K "@ "@J?; ? 
7K 
7 
7 5+ 5 5< 	({ 	( 	( /[ / /> 5; 5 5* ; ; ;2%
 %
P 3,% 3, 3,l 6, 6 6> /( / /& )" ) ).(! (
 ![ ! !: )8^ )8 )8X 4 4 4 #0k #0 #0L 6; 6 6, 4+ 4 48 	5+ 	5 	5 8[ 8 8 
,-B+7 B+JVG#7 GrU   