
    qi                     	   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ ddlmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZP 	 d dlQZQ eeQj                        ZSdZT ej                  eW      ZXej*                  j                  ZYej*                  j                  ZZ e=deNej                  j                  eSdk\  r eLd      n eLd      dd       Z] e=d!eO eLd"      #      Z^ e=d$eO eLd%      #      Z_ e=d&eO eLd'      #      Z` e=d(eO eLd)      #      Zae j                  d*        Zc e:ej                  d+eYj                  j                  ,      Zf e:ej                  d-d.eYj                  j                  /      Zh e:ej                  d0eYj                  j                  ,      Zj e:ej                  d1eYj                  j                  ,      Zl e:ej                  d2deYj                  j                  3      Zo e:ej                  d4eYj                  j                  ,      Zqd5 Zrdddd6d7Zsdgd9Zt e:esd      Zud: Zv G d; d<e*      Zw ew       Zx G d= d>e*      Zyd? Zzd@ Z{ eydAdBez      Z| eydCdDe{      Z} e6eYj                  dE      dhddFdG       Z~ e6eYj                  dE      ddFdH       Z e6eYj                  dE      ddddIdJ       Z e6eYj                  dE      dddKdL       Zej                  ej                  fej                  ej                  fej                  ej
                  fej                  ej                  fgZej                  ej                  gZej                  ej
                  gZdMed8efdNZdMedOed8efdPZdMedQedRedOed8ef
dSZdMedQed8efdTZ	 didUedVej                  dWedOed8ef
dXZd8efdYZdZed[ed\ej                  d]ej                  d8eeef   f
d^Z e6eYj                  j                  dE      	 	 	 	 	 djd_       Ze j                  d`ee   d8efda       Zdb Z	 	 dkdcee   fddZde Zdf Zy# eU$ r  ed      ZSdZTY w xY w)l    N)AnyOptionalUnion)counters)AutoHeuristicSelectAlgorithm)	AHContextcontext_add_stridescontext_add_using_tf32mm_operations)CppGemmTemplate)gen_best_config)opsV)make_fx)ScalingType)TorchVersion   )configdistributed_autotune)CUTLASS2xGemmTemplateCUTLASS3xGemmTemplate)CKTileGemmTemplate)CKGemmTemplate)SubgraphChoiceCallerSubgraphTemplate)BufferChoiceCaller	is_tritonLayout)MMKernelInputs)	loweringsmake_pointwisemake_reductionregister_loweringtransform_args)autotune_select_algorithmExternKernelChoiceKernelTemplaterealize_inputsTritonTemplate)_use_cutlass_for_opceildivuse_aten_gemm_kernelsuse_ck_gemm_templateuse_ck_tile_gemm_templateuse_cpp_gemm_templateuse_cutlass_templateuse_decompose_k_choice!use_triton_blackwell_tma_templateuse_triton_templateuse_triton_tma_template   )_is_static_problemload_kernel_templatemm_argsmm_gridpersistent_mm_griduse_native_matmulTz0.0.0Fmmz3.3.0	triton_mmtriton_mm_rocm)namegridsource"cache_codegen_enabled_for_templateprologue_loads_all_inputsmm_persistent_tmatriton_persistent_tma_mm)r@   rA   rB   %scaled_mm_device_tma_epilogue_scalingtriton_epilogue_scaled_mm&scaled_mm_device_tma_main_loop_scalingtriton_main_loop_scaled_mm"blackwell_ws_persistent_device_tma,triton_blackwell_ws_persistent_device_tma_mmc                     t        |       S N)r'   )fns    _/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/kernel/mm.pylazy_register_extern_choicerQ   {   s    b!!    z
at::mm_out)op_overloadzat::_mm_dtype_out_cudamm_dtype)r@   rS   zat::addmm_outzat::_int_mm_outzat::_sparse_semi_structured_mm)has_out_variantrS   zat::_scaled_mm_outc                 b    | j                         t        j                  t        j                  fv S rN   )	get_dtypetorchint8uint8)mats    rP   _is_int8_matr\      s     ==?uzz5;;777rR   outalphabetac                    | j                  d      dk(  r| j                  d      dk7  s| j                  d      dk(  rt        j                  | d   |||||      S t        j                  | |||||      S )z
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    r   r6   r]   )stridesizerX   addmm)inpmat1mat2r^   r_   r`   s         rP   
bias_addmmrh      sh     	

1sxx{a/CHHQK14D{{3q643e$OO;;sD$Cu4HHrR   returnc                 X    dt         fd}dt         fd}dt         fd}t        j                   | j                               xs  | j	                                fd       t        j                   |j                               xs  |j	                               fd       y )Nri   c                 \    t         j                  j                  j                  | d   d      S )Nr6   r   graphsizevarsstatically_known_equalsrb   s    rP   is_row_majorz.check_supported_striding.<locals>.is_row_major   #    ww77q	1EErR   c                 \    t         j                  j                  j                  | d   d      S Nr   r6   rl   rp   s    rP   is_col_majorz.check_supported_striding.<locals>.is_col_major   rr   rR   c                     t        t        j                  j                  j	                  | d   d      xs- t        j                  j                  j	                  | d   d            S rt   )boolr   rm   rn   ro   )rc   s    rP   has_zero_dimz.check_supported_striding.<locals>.has_zero_dim   sQ    GG44T!Wa@ Dww77QC
 	
rR   c                  *    d j                          S )Nz$mat_a must be row_major, got stride 
get_stride)mat_as   rP   <lambda>z*check_supported_striding.<locals>.<lambda>       6u7G7G7I6JK rR   c                  *    d j                          S )Nz$mat_b must be col_major, got stride rz   )mat_bs   rP   r}   z*check_supported_striding.<locals>.<lambda>   r~   rR   )rw   rX   _checkr{   get_size)r|   r   rq   ru   rx   s   ``   rP   check_supported_stridingr      s    F FF F
d 
 
LLU%%'(JL9I,JK 
LLU%%'(JL9I,JKrR   c                    | j                   d   }|j                   d   }| j                   d   }||z  }|}t        j                  | j                  |||      d      }|j                  |||      }	t        j                  ||	t        j
                        }
t        j                  |
d      }|j                  | j                        S )Nr   r6   )r6   r   r   	out_dtype)	shaperX   permutereshapebmmfloat32sumtodtype)abk_splitsmnkk_partsB
a_reshaped
b_reshapedresultreduced_bufs               rP   
decomposeKr      s    	
A	
A	
A8mGAqyyAw7CJ1gq)JYYz:GF))FA&K>>!''""rR   c                   @     e Zd Z fdZdee   dededef fdZ	 xZ
S )DecomposeKSugraphTemplatec                 &    t         |   d       y )Ndecompose_kr@   )super__init__)self	__class__s    rP   r   z"DecomposeKSugraphTemplate.__init__   s     	 	
rR   input_nodeslayoutk_splitri   c                     ddl m} ddlm} d| d}d|} |       5   |       }t	        t        j                  t        |      |      }	t        
| %  ||||	|	      cd d d        S # 1 sw Y   y xY w)
Nr   enable_python_dispatcherr   select_decomp_tabledecompose_k_mm__splitzk_split=)r   r@   r   r   make_fx_graphdescription)
torch._dispatch.pythonr   decompositionr   r   	functoolspartialr   r   generate)r   r   r   r   r   r   r@   r   decompositionsrO   r   s             rP   r   z"DecomposeKSugraphTemplate.generate   s     	D7 	0!
m%' 	02N!!*w?B
 7#' ' $ 	 	 	s   A A**A3)__name__
__module____qualname__r   listr   r   intr   r   __classcell__r   s   @rP   r   r      s<    

&\  	
 
 rR   r   c                   J     e Zd Zdededef fdZdee   dede	f fdZ
 xZS )	ContiguousTemplater@   r   rO   c                 P    || _         || _        || _        t        |   |       y )Nr   )r@   r   rO   r   r   )r   r@   r   rO   r   s       rP   r   zContiguousTemplate.__init__   s.    	& 	 	
rR   r   r   ri   c                     ddl m} ddlm}  |       5   |       }t	        | j
                  |      }t        |   | j                  |||| j                        cd d d        S # 1 sw Y   y xY w)Nr   r   r   r   r   )
r   r   r   r   r   rO   r   r   r@   r   )r   r   r   r   r   r   rO   r   s          rP   r   zContiguousTemplate.generate  sp    
 	D7%' 	02NB
 7#YY'  ,, $ 	 	 	s   AA$$A-)r   r   r   strr   r   r   r   r   r   r   r   r   s   @rP   r   r      sG    
S 
s 
 
&\  
	 rR   r   c                 J    t        j                  | |j                               S rN   )rX   r=   
contiguous)r   r   s     rP   contiguous_mmr     s    88Aq||~&&rR   c                 L    t        j                  | ||j                               S rN   )rX   rd   r   )re   r   r   s      rP   contiguous_addmmr   "  s    ;;sAq||~..rR   r   zcontiguous mmr   zcontiguous addmm)type_promotion_kindr   c                    | j                         }t        j                  |j                         |k(  d        t        j                   j                         j                  dk(  d        t        j                  ||k(  xs7 |t        j
                  k(  xr" |t        j                  t        j                  fv d        t         |      rt        t        j                      d       t        t        j                     |d      }t         |gi ddd	
      \  }}t        j                  j                  rO j                   t        j                  t        j                  fv r# fd}|D cg c]  } t#        |      |       }} t#        t$        j&                        | }	 t)        d      |	d      }
|
S t+         |||      \  }}}} }t-        |      \  }}d}t/         |g|      }t0        d   d| d| d| xx   dz  cc<   t2        j5                  d||| j                         |j                         |       g }t-        |      \  }}t6        }i }|
t8        }d|i}g }i }t;               r"|j=                  |       |r|||j>                  <   ||rtA        |d      rtC        |||      r|j=                  tD               t        jF                  dk(  }|stC        |||d      s[|j=                  tH               tK         ||      r|j=                  tL               tO         ||      r|j=                  tP               |j=                  tR               |jU                  tV        jX                  j[                  ||d|             |@|r>t]        ||||      r0t_        d      r%ta        jb                  |||je                                |5|r3tg        ||||      r%ti        jj                  |||je                                |5|r3tm        ||||      r%to        jp                  |||je                                |2ts        | |      r%tu        jp                  |||je                                 |g}||rtA        |      rt        jv                  jx                  j{                  |      rt}               rg }t;               r|j=                  d       t        |      }|jU                  tV        jX                  j[                  |tH        gd             t         |||||||t               dd|      }t        jv                  jx                  j                  |      s*|#t        |      dkD  r|D cg c]	  }||v s| }}n|d| }|Mt        j                  D ]:  }|j=                  t        |      j                  |je                         |             < d}|0t        jv                  jx                  j                  rt         |      }t        j                  |||je                         |      x}r|S t        |||je                         ||       S c c}w c c}w )!z_
    Lowering for autotuning aten.mm with different backends (Aten, Triton, CUTLASS, etc.)
    Nc                       y)Nzinput dtypes must be the same r   rR   rP   r}   ztuned_mm.<locals>.<lambda>7      rR   cudac                       y)Nz$out_dtype is only supported for CUDAr   r   rR   rP   r}   ztuned_mm.<locals>.<lambda>;  r   rR   c                       y)NzFout_dtype must be the same as input dtype or fp32 for fp16/bf16 inputsr   r   rR   rP   r}   ztuned_mm.<locals>.<lambda>C  r   rR   r   TF)argskwargs	broadcastr   convert_input_to_boolc                 H    t        j                  | j                  d      S )NF)use_compute_types)r   to_dtyper   )xrf   s    rP   	_to_dtypeztuned_mm.<locals>._to_dtyped  s    ||AtzzUKKrR   dotr6   r   r   r=   r   aten_mm_infozaten.mm__zOTuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   check_max_autotune
exhaustiver   )threshold_multipleoutput_layoutkwarg_overrides	extern_mmzmm-ah
   )top_kalways_included)best_config_future)KrW   rX   r   
get_devicetyper   float16bfloat16r<   r!   aten	unsqueezer%   inductor_configtritoncodegen_upcast_to_fp32r   r"   r   r   r#   r9   r7   r    r   loginfoaten_mmaten_mm_dtyper-   appenduidr4   r2   decompose_k_subgraph_templatemax_autotune_gemm_search_spacemm_templater5   persistent_tma_mm_templater3   .blackwell_ws_persistent_device_tma_mm_templatemm_contiguous_subgraph_templateextendr   choicesget_template_configsr1   r+   r   add_cutlass_gemm_choicesnodesr.   r   add_ck_gemm_choicesr/   r   add_choicesr0   r   	_inductorr   run_autoheuristicr   lenmm_autoheuristicr   collect_autoheuristicexternal_matmulrQ   bindremote_gemm_autotune_cacher   r   maybe_autotune_remoter&   )rf   rg   r   r   input_dtyper   r   r   r   mul_pointwisedot_reductionr   r   r   static_shape
is_nonzeror@   kernel_inputsr  aten_handleraten_extra_kwargstemplates_to_user   is_exhaustiver   r    num_choices_before_extra_configs
ah_choiceschoicer   boxs   `                              rP   tuned_mmr   .  s    
 nn&NN+3	
 	OO""f,:	
 	$ U]]* CEMM5>>#BB\	
. t$(r2(q1% $"'
f !!88TZZMMNNL
 >

L ;??Q-N9-a0?D?/sww/6-u-mQ? #*d6Y#Aq!VT4  2&9L*D #D$<9EM ^xs!A3as3494HHY			 #%G1&9L*'.L(*$()4HJ13O-0AOL,,- 	4@!!Q*##$AB (FF,V 6q!QST U##K0&tTH ''(BC0t6R ''(VW ?@NN			&&+	 	' 	
 	 Aq1%66V]002	
 Z,@Aq,Q**7FM<O<O<QRZ,EfaQRTU,V&&w8K8K8MN264F##!	
 ,K'OO""44T:dO """;/+.w<(II** 		
 &O+

 %%;;DA%#j/A*=
 18Pf6Z;O6PP!"C#CD 00 	ANN+A.33M4G4G4I6R	
 U__33NN -T48"88g}**,f s  
$- M @b Qs   +Y2	Y<Yc          	      
   t        | ||t        j                        \  }}}}} }d}t        d   d| d| d| xx   dz  cc<   t        j                  d|||| j                         |j                         |       t        |      \  }}|xr |xr t        ||||      }	g }
t        | |gt        j                        }g }t               r|j                  t               |r#t        |d	d
      r|j                  t               |
j                  t         j"                  j%                  |||             |	r3t'        |      r(t)        j*                  |
||j-                         d	d	       t/        ||
|j-                         |      S )Nr   int_mmr   zaten._int_mm_r   r6   zTTuned aten._int_mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr   TF)enable_int32r   fuseablenon_fuseable)r9   rX   int32r   r   r   rW   r7   r1   r    r-   r   aten__int_mmr4   r   r  r   r  r  r+   r   r  r  r&   )rf   rg   r   r   r   r   r@   r  r  use_cutlassr  r  r  s                rP   tuned_int_mmr*    s    #*d6U[[#Aq!VT4 D^}QCq1QC89Q>9HH^			  2&9L*W:W2FvqRSUV2WK"$G #D$<5;;GM IK-)Te 	, NN			&&}6FM *4066V]002TPT	
 %T7M4G4G4I6RRrR   )r_   r`   r   c          	         t        ||      r|dk(  rd}nt        t        j                     ||       }|dk(  rd}n8t        t        j                     |t        t        j                     ||            }t        t        j
                     ||      S t        ||| |      \  }}	}
}}}}t        |      \  }}d}t        |||gt        ||            }g }t        d   d| d|	 d|
 xx   d	z  cc<   t        j                  d
||	|
|j                         |j                         |       |r t        j                  s|t        j                   slt        | ||gt        ||            }|j#                  t$        j&                  j)                  |t*        g|             t-        |||j/                         |      S g }t1               r|j#                  t2        t*        g       |r}t5        |d      rp|j7                  t8               t;        |||      r|j7                  t<               t?        |||      r|j7                  t@               |j7                  tB               |j#                  t$        j&                  j)                  |||             |rEtE        |||	|
      r7tG        |      r,tI        jJ                  |||j/                  g d      ||       |r=tM        |||	|
      r/tO        jP                  |||j/                  g d      ||g d       tS        |||      r)tU        jV                  |||j/                         ||d       t-        |||j/                         |      S )zb
    Lowering for autotuning aten.addmm with different backends (Aten, Triton, CUTLASS, etc.)
    r   r   rd   )r_   r`   )scalarsr   zaten.addmm_r   r6   zRTuned aten.addmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sFr   r   )r6   r   r   )reorder)r   r   r6   )r_   r`   input_reorderT)r_   r`   has_bias),r<   r!   r   mulr=   addr9   r7   r    dictr   r   r   rW   r   max_autotunemax_autotune_gemmr  r   r  r  
aten_addmmr&   r  r-   aten_bias_addmmr4   r   r   r5   r   r3   r   "addmm_contiguous_subgraph_templater1   r+   r   r  r.   r   r  r0   r   r  )re   rf   rg   r_   r`   r   arg1arg2r   r   r   inp_expandedr  r  r@   r  r  r  s                     rP   tuned_addmmr;  G  s-   
 t$19DTXX&tS1DA:DTXX&ui.@t.LMD"4.. 18dCPV0W-Aq!VT41&9L*D #	tT"Du4,HM #%G ^{1#Qqc1#671<7HH\			 ))_-N-N
 '$t%d'C
 	II**	
 )w8K8K8MvVV IK* =>)&UK,"4VD##$>?,T4vN##$RS BC NN			&&}6FM
 	 Aq1%66 	2	
 *61a;** 	2#		
 VT40##!	
 %T7M4G4G4I6RRrR   )r   r   c                   ddl m}  || ||      \  } }}| j                         \  }}|j                         \  }}	|j                         \  }
}t        j                  j
                  j                  ||      }t        j                  j
                  j                  d|z  |
      }|6ddlm}  ||j                         |r|n|j                         ||g|dg      }n	|J d       t               rt        j                  | ||f||      gng }||z  dk7  r6t        ||||      r(t        d      rt!        j"                  ||| ||gd	d	
       t%        d|| ||f|      S )Nr   )r)   r   )FixedLayoutr6   z,out_dtype is ignored if layout is specified.r   sparse_semi_structured_mmTr$  ) torch._inductor.select_algorithmr)   r   r   rm   rn   check_equals_and_simplifytorch._inductor.irr=  r   rW   r-   aten__sparse_semi_structured_mmr  r1   r+   r   r  r&   )rf   	mat1_metarg   r   r   r)   m1k1m2r   k2r   r   r   r=  r  s                   rP   tuned_sparse_semi_structured_mmrH    su    @ +4DAD)T]]_FB EBMMOEB	222r:A	221r62>A~2OO"I(8FF	
  P"PP  !"	 ,00y$'9 1 	
   	
A
 Aq1 ;<66VdD)4tRV	
 %#WtY.Ev rR   szc                 F    t        |       dk(  xs t        d | D              S )Nr   c              3   p   K   | ].  }t         j                  j                  j                  |d        0 yw)r6   Nrl   ).0ds     rP   	<genexpr>z)_is_tensorwise_scaling.<locals>.<genexpr>  s,      !;<00A6!s   46)r  all)rI  s    rP   _is_tensorwise_scalingrP    s+    GqL S !@B!  rR   	transposec                 h    |rdnd}t         j                  j                  j                  | |   d      S )Nr   r   r6   rl   )rI  rQ  idxs      rP   _is_rowwise_scalingrT    s,    !bC7733BsGQ??rR   	tensor_sz	tile_sizec                     |rdnd}|rdnd}t         j                  j                  j                  | |   ||         xr: t         j                  j                  j                  | |   t	        ||   |            S )Nr6   r   r   rm   rn   ro   r,   )rI  rU  rV  rQ  lhsrhss         rP   _is_blockwise1xTILESIZE_scalingr[    sq     !aC!aC7733
33 
''


2
2
333rR   c                     t         j                  j                  j                  | d   t	        |d   d            xr: t         j                  j                  j                  | d   t	        |d   d            S )Nr      r6   rX  )rI  rU  s     rP   _is_blockwise128x128_scalingr^    sd    7733
1wy|S) V
''


2
22a5')A,PS:T
UVrR   t
scale_sizescaling_typec                 X   |xt         j                  k(  r t        |      S xt         j                  k(  r t	        ||      S xt         j
                  k(  r t        || j                         d|      S t         j                  k(  rt        || j                               S 	 t        d|       )Nr]  Unsupported scaling type )r   
TensorWiserP  RowWiserT  BlockWise1x128r[  r   BlockWise128x128r^  AssertionError)r_  r`  ra  rQ  s       rP   is_desired_scalingri    s     #[##)*55 [  &z9=='[''2AJJL#y  ))/
AJJLII #<\N!KLLrR   c                 t    | xt         j                  k(  r yt         j                  k(  ry	 t        d|  d      )Nr]  rc  z in get_tile_size)r   rg  rf  rh  )scale_options    rP   get_tile_sizerl  -  s<    
)[))'' +L>9JK rR   r|   r   scale_a_sizescale_b_sizec                     t         D ](  \  }}t        | ||      st        |||d      s$||fc S  t        d| d|       )NT)rQ  z1Inductor Triton does not support scale_a.shape = z, scale_b.shape = )scaling_pairsri  rh  )r|   r   rm  rn  scale_option_ascale_option_bs         rP   get_scaling_optionsrs  9  sc     +8 2&<
 nPTU!>11	2 
;L>I[\h[ij rR   c	           	         t        | |||      \  }	}
}}} }t        d   d|	 d|
 d| xx   dz  cc<   t        j                  d|	|
|| j	                         |j	                         |       d}t        | |       t        ||      \  }}|s| |||g}nt        |      }| ||||g}t        |dd|	      }g }g }i }t               r3|j                  t               t        ||
      |t        j                  <   t        |      \  }}|j                  t        j                   k(  rn|rkt#        |dd      r\t        |      }t%        | ||      r|s|j&                  |j&                  }}t)        | |||      \  }}|j*                  |d<   |j*                  |d<   |t,        v r1|t,        v r)|j                  t.               ||t.        j                  <   n`|t0        v rM|t0        v rEt3        |      |d<   t3        |      |d<   |j                  t4               ||t4        j                  <   nt7        d      t9        | ||      r*|s(|j                  t:               ||t:        j                  <   |j                  t<               ||t<        j                  <   |j?                  t@        jB                  jE                  ||||             |j                  t        j                   k7  rtG        ||||      S |r@tI        ||	|
|      r2tK        |      r'tM        jN                  |||jQ                         |       |r3tS        ||	|
|      r%tU        jV                  |||jQ                                tG        |||jQ                         |      S )a9  
    Performs an optimized matrix multiplication where scaling factors are applied
    to the inputs and/or output.

    Args:
        mat1 (Tensor): First input matrix
        mat2 (Tensor): Second input matrix
        scale1 (Tensor): Scale factor applied to mat1 (supports broadcasting)
        scale2 (Tensor): Scale factor applied to mat2 (supports broadcasting)
        bias (Tensor, optional): Optional bias tensor to add to the result
        layout: Layout hint for optimization

    Returns:
        Tensor: The result of the scaled matrix multiplication
    r   r   zaten._scaled_mm.default_r   r6   z_Tuned aten._scaled_mm.default: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%s	scaled_mmr   )mat1_idxmat2_idxr   )r   use_fast_accumTF)enable_float8r   )USE_FAST_ACCUMr   SCALE_RECIPE_ASCALE_RECIPE_BTILE_SIZE_ATILE_SIZE_BzpInductor Triton does not support scaling options that are present in both epilogue scaling and main loop scalingr   )rx  ),r9   r   r   r   rW   r   r)   r    r-   r   aten__fp8_mmr2  r   r7   r   rX   r   r4   r5   r   rs  valueepilogue_scaling_types.scaled_mm_device_tma_epilogue_scaling_templatemain_loop_scaling_typesrl  /scaled_mm_device_tma_main_loop_scaling_templaterh  r3   r   r   r  r   r  r  r&   r1   r+   r   r  r  r.   r   r  )r|   r   scale_ascale_bbiasscale_resultr   rx  r   r   r   r   r@   scale_a_realscale_b_realr   	bias_realr  r  r  r   r   r  
overridersrm  rn  rq  rr  s                               rP   tuned_scaled_mmr  J  s   8 %,uVy%!Aq!VUE ^7s!A3asCDIDHHi			 DUE*!/!AL, e\<@"4(	e\<K #a!yM #%G IKO-,0-
(() 'v.MAz 	&duU8
 #5%vFt)5););\=O=O,L-@ulL.*NN ,:+?+?J'(+9+?+?J'( "88"&<< ''(VW   N R RS "99"&==,9.,I
=),9.,I
=) ''(WX   O S ST %G  .eU&Q##$RS JNNO 	,+5( NN			&&+	 	' 	
 }}%(wVLL 	 Aq1%66!)		
 *61a;**7FM<O<O<QR$T7M4G4G4I6RRrR   indexc                 f    t         j                  j                  | xs d      }|j                  dk  S )Nr      )rX   r   get_device_propertiesmajor)r  propss     rP   _is_sm7x_or_older_gpur    s)    JJ,,UZa8E;;!rR   c                 &    t        d | D              S )Nc              3   <   K   | ]  }t        |t                y wrN   )
isinstancer   )rL  dims     rP   rN  zdims_are_int.<locals>.<genexpr>  s     4z#s#4s   )rO  )dimss    rP   dims_are_intr    s    4t444rR   r   c           	          t        | ||||      \  }}}t        |||g      sy t        | |      \  }}fd}d } ||||| |||      }t        ||||||	      }|
|j	                  |
|      S |j                         S )Nc                 V   t               }|j                  d|        |j                  d|       |j                  d|       |j                  d|j                  j                  d       |j                  d|j                  j                  d       t	        |d|       t	        |d	|       |j                  d
|j                  j                         d       |j                  d|j                  j                         d       dk(  r t        ||j                  j                         |S )Nr   r   r   
mat1_dtypeT)is_categorical
mat2_dtyperf   rg   mat1_iscontigmat2_iscontigr=   )r   add_featurer   r   r	   is_contiguousr
   )	r   r   r   rf   rg   mat1_stridemat2_stridecontextr@   s	           rP   get_contextz%mm_autoheuristic.<locals>.get_context	  s   +C#C#C#L$++*;*;DQL$++*;*;DQGV[9GV[9T[[668 	 	
 	T[[668 	 	
 4<"7DKK,=,=>rR   c                       y rN   r   r   rR   rP   fallbackz"mm_autoheuristic.<locals>.fallback  s    rR   )r  r  r   r  r@   augment_contextprecondition)r   )get_size_hintsr  get_size_hints_stridesr   get_top_k_choices_callerget_choice_caller)rf   rg   r   r   r   r  r@   r   r   r  r   r   r  r  r  r  r  autoheuristics         `           rP   r  r    s     T4Aq1GAq!Aq	"5dDAK& !Q4{KHG0!M 55? 6 
 	
 **,,rR   c                    t        |t              rt        |t              s^t        j                  j                  j                  | j                         t        j                  j                  j                        \  }}t        |t              rt        |t              s^t        j                  j                  j                  |j                         t        j                  j                  j                        \  }}|||fS )Nr  )r  r   r   rm   rn   
size_hintsr   rX   r	  r   unbacked_symint_fallback)rf   rg   r   r   r   s        rP   r  r  3  s    aZ3%7!!,,MMO__++DD - 
A
 aZ3%7!!,,MMO__++DD - 
A a7NrR   c                 d   | j                   j                  }|j                   j                  }||g}g }|D ]p  }t        |t              sMt        j
                  j                  j                  |t        j                  j                  j                        }|j                  |       r |d   |d   fS )Nr  r   r6   )r   rb   r  r   r   rm   rn   r  rX   r	  r   r  r   )rf   rg   r  r  stridesstrides_hintsrb   s          rP   r  r  B  s    ++$$K++$$KK(GM %&#&WW%%00//HH 1 F 	V$% ]1---rR   )ri   NrN   )F)NNNFN)NN)r   loggingtypingr   r   r   rX   torch._dynamo.utilsr   +torch._inductor.autoheuristic.autoheuristicr   1torch._inductor.autoheuristic.autoheuristic_utilsr   r	   r
   r   )torch._inductor.codegen.cpp_gemm_templater   *torch._inductor.remote_gemm_autotune_cacher   torch._inductor.virtualizedr   r   "torch.fx.experimental.proxy_tensorr   torch.nn.functionalr   torch.torch_versionr    r   r   r   codegen.cuda.gemm_templater   r   ,codegen.rocm.ck_tile_universal_gemm_templater   'codegen.rocm.ck_universal_gemm_templater   codegen.subgraphr   r   irr   r   r   r   r  r    loweringr!   r"   r#   r$   r%   select_algorithmr&   r'   r(   r)   r*   utilsr+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   	mm_commonr7   r8   r9   r:   r;   r<   r   __version__triton_version
has_tritonImportError	getLoggerr   r   r   primsversionhipr   r   r  r  r   cacherQ   r=   r^   r   	dtype_outr   rd   r5  _int_mmr(  _sparse_semi_structured_mmdefaultrB  
_scaled_mmr  r\   rh   r   r6  r   r   r   r   r   r   r  r7  r   r*  r;  rH  rd  re  rf  rg  rp  r  r  rw   rP  rT  r   r[  r^  Tensorri  rl  tuplers  r  r  r  r  r  r  r   rR   rP   <module>r     s     ' '  ( T  F F . 6 + , > U M D E 8 8 *      !&"4"45NJ
 g!yy~~		
 		!n&?  ,
 
.	/'+" ,		 :;  2@	0	 ;<2 . 3A	1	 <=3 / 2@	-	 NO2 . " " UXX|
M"	HH	!!	  	KKdjjnn
 "	MM$$,,2B2B #5	$$$//77	#  "	*8K8K
8 (,11 I4 %Z6#  0  F !: ; ) D'/ #5_m#  &8*,<& "
 4775d4 d 6dN 4<<T:'+ -S ;-S` 4::48*+!D sS 9sSl 422M(,T- N-b [334+--.!=!=>!;!;<	 &00+2E2EF &55{7S7ST s t @C @D @T @
			(+	8<			VS VS VT V 	M
MM M 	M
 
M*	3 	 ,, ,,	
 ;#$" 4??**E 
^S F^SB # 4  
5  :- C=:-z.w  !'*NJs   S S10S1