
    qi`?                     T   d dl Z d dlZd dlmZ d dlmZmZ d dlZd dlm	c m
Z
 d dlmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlmZ d d	lmZ  ej:                  e      Zd
ej@                  jB                  de"e   de#defdZ$ G d dejJ                        Z& G d de      Z'y)    N)Callable)AnyUnion)ir)KernelTemplate)BufferFixedLayoutget_free_symbolsget_symbolic_inputsgm_original_output_stridesir_node_to_tensorLayout)benchmarker)do_bench_using_profilingVgminputsnamereturnc                      ddl m}  || |      S )a.  Inline a subgraph by converting its FX operations to individual IR nodes.

    This converts a subgraph to multiple ComputedBuffer nodes (fusable),
    enabling epilogue fusion with subsequent operations.

    Returns:
        TensorBox containing the final operation result as individual IR nodes
    r   )process_subgraph_nodes)torch._inductor.loweringr   )r   r   r   r   s       f/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/codegen/subgraph.pyinline_subgraph_to_ir_nodesr      s     @!"f--    c                   F    e Zd ZdZdedee   dededede	f   dd	f fd
Z
defdZdee	   dee	ee	   f   fdZdee	   dej                  defdZdee	   dej                  dd	fdZdefdZdeej,                  ej.                  f   fdZdeee	f   fdZdefdZ xZS )SubgraphChoiceCallerz
    Represents a Subgraph Autotuning choice, and the subgraph can be any arbitrary
    GraphModule. Compiles the Subgraph down to a module for benchmarking.
    r   input_nodeslayoutdescriptionmake_fx_graph.r   Nc                 h   t         |   ||||       g | _        t        j                  5  | j
                  D ]  }t        t        |j                         d            dk(  sJ t        t        |j                         d            dk(  sJ |j                  j                          | j                  j                  t        |              	 d d d         || j                   | _        t        | j                         t!        | j
                        | _        d | _        d | _        y # 1 sw Y   [xY w)NT)unbacked_onlyr   )super__init__example_inputsr   	fake_moder   lenr
   get_size
get_stridedatafreeze_layoutappendr   r   r   r   
sym_inputs_compiled_module_compiled_sym_inputs)selfr   r   r    r!   r"   inp	__class__s          r   r&   zSubgraphChoiceCaller.__init__1   s
    	{FK@ [[ 	C'' C+CLLN$OPTUUUU+CNN,<DQRVWWWW&&(##**+<S+ABC	C  !4!45"477+-d.>.>? &*6:!!	C 	Cs   B"D((D1c                 "    d| j                    dS )NzSubgraphCaller()r   r2   s    r   __str__zSubgraphChoiceCaller.__str__N   s     1--r   argsc                 ~   ddl mc m} ddlm} | j
                  j                  dd      j                  dd      } || j                  | j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                   d| 	      }| j"                  D ]@  }||j$                  |j
                  <   |j&                  j)                  |j
                         B | j"                  D cg c]>  }t+        t        j                  j,                  j.                  j1                  |            @ }}t3        |      dk(  rt5        || j                        D ]z  \  }	}
t7        |	t8        j:                        s!t7        |
t8        j:                        sJ |	j<                  |
j<                  k(  sJ |	j?                         |
j?                         k(  rzJ  t        j@                  |      5  |jC                  d	d	d
      5   |jD                  | j                    |jG                         }ddd       ddd       |fS c c}w # 1 sw Y   xY w# 1 sw Y   |fS xY w)z
        Compile the subgraph for benchmarking and return (module, sym_inputs).

        TODO: Add precompile() method to enable parallel compilation of all choices
        before benchmarking.
        r   N)GraphLoweringz::_.
benchmark_)	r   r'   	shape_envcpp_wrapperaot_modeextern_node_serializeris_inferenceis_backwardr   FATEN)max_autotunemax_autotune_gemmmax_autotune_gemm_backends)$torch._inductor.config	_inductorconfigtorch._inductor.graphr<   r   replacer   r'   r   graph
_shape_envrA   rB   rC   rD   rE   r/   graph_inputsgraph_input_namesr.   intsizevarsr@   	size_hintr)   zip
isinstancetorchTensorshapestrideset_graph_handlerpatchruncompile_to_module)r2   r:   inductor_configr<   	safe_namebm_graph_loweringsym_inpsym_varr/   arexample_inpmods               r   _compile_for_benchmarkingz.SubgraphChoiceCaller._compile_for_benchmarkingQ   sI    	987II%%dC088cB	)ww..gg((++WW%%#$77#A#A--++i[)

  	EG;B**7<<8//66w||D	E  ??
    **44W=>

 
 z?a $'tT-@-@#A ?Kb%,,/%k5<<@@@88{'8'888899;+*<*<*>>>>?   !23 	< &&""'+1 '  <
 &!%%t':':;'99;<	< J7
&< <	< Js+   AJJ0!*J$J0$J-	)J00J<outc                T   | j                   ! | j                   \  }|| _         | _        n| j                   }| j                  J |j                  t        j
                  rt        fd      S t        j                  fdt        j                  g        S )zT
        Regular benchmarking: compile and use benchmarker with warmup/rep.
        c                       g        S N r:   bm_funcr/   s   r   <lambda>z0SubgraphChoiceCaller.benchmark.<locals>.<lambda>   s    G<Pj<P4<P4Q r   c                       g        S rl   rm   rn   s   r   rp   z0SubgraphChoiceCaller.benchmark.<locals>.<lambda>   s    G0j0401 r   )device)
r0   rh   r1   callrL   /profile_bandwidth_with_do_bench_using_profilingr   r   	benchmarkinfer_device)r2   ri   r:   rg   ro   r/   s     ` @@r   ru   zSubgraphChoiceCaller.benchmark   s       (<d<<dCOC$'D!(2D%''C22J)))((AA+,QRR$$1++?Z?$?
 	
r   c                    | j                   ! | j                  | \  }}|| _         || _        n| j                   }| j                  }|J |j                  } |g ||       y)z
        Only run once with cached compiled module.
        Called by benchmark_collective_choice which handles warmup
        and timing with barrier synchronization across all ranks.
        N)r0   rh   r1   rs   )r2   ri   r:   rg   r/   ro   s         r   benchmark_collectivez)SubgraphChoiceCaller.benchmark_collective   st       (<d<<dCOC$'D!(2D%''C22J)))(($*$t$%r   c           
      v   dj                  | j                  j                  dd      d   g| j                  D cg c]  }t	        |j                                c}| j                  D cg c]  }t	        |j                                c}t	        | j                  j                              S c c}w c c}w )N-r=      r   )	joinr   rsplitr   strr*   r+   r   rO   )r2   r3   s     r   hash_keyzSubgraphChoiceCaller.hash_key   s    xx		  a(+151A1AB##clln%B 483C3CDC#cnn&'D DGGMM"	
 	
 CDs    B1
* B6
c           	          t         j                  j                  t        j                  | j                  | j
                  | j                  | j                  | j                              S )N)r    r   r   r'   subgraph_name)	r   	TensorBoxcreateSubgraphBufferr    r   r   r'   r   r8   s    r   output_nodez SubgraphChoiceCaller.output_node   sN    ||""{{ ,,77#22"ii
 	
r   c                      d| j                   dS )zRInformation returned here is logged to the autotune log file when that is enabled.subgraph)backendkernel_namer7   r8   s    r   	info_dictzSubgraphChoiceCaller.info_dict   s     "99
 	
r   c                      d| j                    S )N	subgraph_r7   r8   s    r   autoheuristic_idz%SubgraphChoiceCaller.autoheuristic_id   s    499+&&r   )__name__
__module____qualname____doc__r~   listr   r   r   r   r&   r9   tuplerh   rX   rY   floatru   rx   r   r   r   r   ShapeAsConstantBufferr   dictr   r   __classcell__r4   s   @r   r   r   +   s   
;; &\; 	;
 ;  S); 
;:. .7tCy 7U3S	>=R 7r
tCy 
u|| 
 
,&$s) &%,, &4 &$
# 
	
U2<<1I1I#IJ 	

4S> 
'# 'r   r   c                       e Zd ZdZ ej
                         Zdef fdZ	 ddede	e
   dededef   ded	ed
efdZ	 ddede	edef      de	e
   de	eeef      dedef   dz  d
e	e   fdZdedef   d	eeef   d
efdZd	eeef   d
dfdZdede	edef      de	e   d
dfdZ	 dde	e
   dedef   d	eeef   dedef   dz  d
ef
dZ xZS )SubgraphTemplatea  
    A template for subgraph evaluation to be used in autotuning.

    This class allows creating customized subgraphs that can be appended
    as choices during the autotuning process, enabling the selection of
    optimal implementations for complex operations.
    r   c                 &    t         |   |       y)z
        Initialize a subgraph template.

        Args:
            name: The name of this template
            graph: The FX graph
        r7   N)r%   r&   )r2   r   r4   s     r   r&   zSubgraphTemplate.__init__   s     	d#r   r   r    r"   .r!   kwargsr   c                 Z    t        | dt        t        j                         ||||      S )a-  
        Generate a SubgraphChoiceCaller instance for autotuning.

        Args:
            name: The name for this subgraph choice
            input_nodes: List of input nodes to the subgraph
            layout: Memory layout information for the output
            make_fx_graph: Callable that creates the FX graph for this subgraph
            description: Optional description of this choice
            **kwargs: Additional keyword arguments

        Returns:
            SubgraphChoiceCaller: A callable object that can be used for autotuning
        r=   )r   r   r    r!   r"   )r   nextr   index_counter)r2   r   r   r    r"   r!   r   s          r   generatezSubgraphTemplate.generate   s8    0 $64 0 > >?@A##'
 	
r   Ndecompositionsnon_tensor_argsdefault_implc           
      T   |sg S t        |      t        |      k(  s J dt        |       dt        |       d       t        ||      D cg c]  \  }}| j                  ||||       }}}| j                  |||       |d   }	g }
t        ||      D ]  \  }}ddl||ddt
        dt        d	t
        f   d
t        t        t
        f   dt
        ffd}| j                  ||      }| j                  | d| ||	|d|j                         }|
j                  |        |
S c c}}w )a  
        Generate multiple SubgraphChoiceCaller instances for custom op autotuning.

        This method extends SubgraphTemplate to support custom op decompositions,
        allowing multiple implementations to compete in autotuning.

        Args:
            name: Base name for the choices
            decompositions: List of decomposition functions to compete in autotuning
            input_nodes: List of tensor inputs. All tensor arguments must be passed here.
            non_tensor_args: List of non-tensor kwargs only, one dict per corresponding decomposition.
            default_impl: Default implementation for layout inference

        Returns:
            List of SubgraphChoiceCaller instances for autotuning
        z>decompositions and non_tensor_args must have same length, got z decompositions and z kwargsr   N)decompdecomp_kwargsr:   r   .r   r   c                 f    ddl m} ddlm}  |       }  | j                  | fi ||      | S )Nr   )make_fx   )select_decomp_table)decomposition_table)"torch.fx.experimental.proxy_tensorr   decompositionr   partial)r   r   r:   r   r   r   	functoolss         r   r"   zBSubgraphTemplate.generate_custom_op_choices.<locals>.make_fx_graph9  sH     G?&9&;#w%I%%f>>(;  r   r=   z	CustomOp )r   r   r    r"   r!   )r)   rV   _infer_custom_op_layout_validate_layout_equivalencer   r   r   r   r~   _generate_variant_namer   r   r.   )r2   r   r   r   r   r   r   r   layoutsr    choicesr   r"   variant_namechoicer   s                  @r   generate_custom_op_choicesz+SubgraphTemplate.generate_custom_op_choices
  sx   0 I>"c/&:: 	
~&'';C<P;QQXZ	
: #&no"F
 ((fflS
 
 	))$H.0%(%I 	#!FM .40= c*  $CH~ 	$  66v}ML]]vQ|n-'+''89 # F NN6"?	#B W
s   D$r   c                     |j                   }|s|S dj                  d t        |j                               D              }| d| S )zLGenerate a descriptive name for a decomposition variant with its parameters.r=   c              3   0   K   | ]  \  }}| d |   yw)r=   Nrm   ).0kvs      r   	<genexpr>z:SubgraphTemplate._generate_variant_name.<locals>.<genexpr>_  s     Ntq!1#Qqc
Ns   )r   r|   sorteditems)r2   r   r   	base_nameparam_suffixs        r   r   z'SubgraphTemplate._generate_variant_nameX  sF     OO	xxNvflln7MNNAl^,,r   c                     |j                         D ]<  \  }}t        |t        j                  t        f      s'J d| dt        |       d        y)z8Validate that kwargs contains only non-tensor arguments.zkwargs['z'] contains tensor zo. Tensor arguments should be in input_nodes, not kwargs. Only scalar/non-tensor parameters should be in kwargs.N)r   rW   rX   rY   r   type)r2   r   keyvalues       r   _validate_non_tensor_kwargsz,SubgraphTemplate._validate_non_tensor_kwargsb  sZ     ,,. 	JC!%%,,)?@ 3%24;- @I J@	r   op_namer   c                 ,   |sy|d   }t        |dd d      D ]  \  }}|j                  |j                  |j                  |j                  f|j                  |j                  |j                  |j                  fk7  sdt        d| d||   j                   d|j                   d|j                   d|j                   d|j                   d	|d   j                   d|j                   d|j                   d|j                   d|j                   d
       y)zXEnsure all layouts have consistent stride, device, dtype, and sizes for fair autotuning.Nr   r{   )startzLayout mismatch in custom op 'z': decomposition 'z' produces (z, z) but 'r6   )	enumeraterr   dtypesizer[   AssertionErrorr   )r2   r   r   r   	referenceir    s          r   r   z-SubgraphTemplate._validate_layout_equivalencek  s(    AJ	"712;a8 	IAvv||V[[&--H    	M  %4WI >&&4Q&7&@&@%A BbbR W*1-667 8!(()IOO+<By~~>NbQZQaQaPbbc	e 	r   function_decompositionc                    ddl }ddlm} | j                  |       |j                  5  g }|D ]  }|j                         }	|j                  j                  j                  |	t        j                        }
t        j                  |
|j                         |j                               }|j                  |         |j                   |fi |} || }t#        |t        j$                        sJ dt'        |       d       t)        |j*                  |j,                  |j.                  |j1                               cddd       S # 1 sw Y   yxY w)	zInfer output layout for custom ops using the default implementation when available.
        Note that the Subgraph assumes custom ops return exactly one tensor output.
        TODO: Add support for multiple output custom ops.
        r   Nr   )fallback)r   rr   z#Expected single tensor output, got z:. Multi-output custom ops not yet supported in autotuning.)rr   r   r   r[   )r   torch._inductor.virtualizedr   r   r(   r*   rO   rT   
size_hintsrL   unbacked_symint_fallbackrX   empty	get_dtype
get_devicer.   r   rW   rY   r   r	   rr   r   rZ   r[   )r2   r   r   r   r   r   r   r'   r3   	raw_shapeconcrete_shapefake_tensorfnoutputs                 r   r   z(SubgraphTemplate._infer_custom_op_layout  s4    	1 	((0[[ 	N" 3LLN	!"!1!1!<!<(G(G "= " $kk"#--/#..BR %%k23 #""#9DVDB(F fell3 5d6l^ DK L3
 }}ll\\}}	+	 	 	s   DEE) rl   )r   r   r   r   	itertoolscountr   r~   r&   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s   @r   r   r      s    $IOO%M$$& 

 &\
 	

  S)
 
 
 

L 37LL Xc3h/0L &\	L
 d38n-L sCx(4/L 
"	#L\-sCx(-26sCx.-	-$sCx. T  Xc3h/0 f	
 
> 37,&\, !)c 2, S#X	,
 sCx(4/, 
,r   r   )(r   loggingcollections.abcr   typingr   r   rX   rJ   rK   rL   torch._inductorr   torch._inductor.codegen.commonr   torch._inductor.irr   r	   r
   r   r   r   r   $torch._inductor.runtime.benchmarkingr   torch._inductor.utilsr   r   r   	getLoggerr   logfxGraphModuler   r~   r   ChoiceCallerr   r   rm   r   r   <module>r      s      $   ' '  9   = : ) g!..&*3i.7:.. d'2?? d'N_~ _r   