
    qik                        d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ ddl m!Z!  ed      Z" ed      Z#e$ejJ                  df   Z&e$e'ejP                     e'ejJ                     f   Z)ejT                  jW                  e,d      Z-d dlm.Z.m/Z/ erd dl0m1Z1m2Z2 dejJ                  de	ejJ                     fdZ3dejJ                  de	ejJ                     fdZ4dejJ                  de5ejJ                  e6f   de	ejJ                     fdZ7dejJ                  de5ejJ                  e6f   de	ejJ                     fdZ8 e jr                  d       G d d             Z:e
d d!d"ejJ                  d#ejJ                  d$ed   de	e$e)e)f      f
d%       Z;e
	 dAd d!d"ejJ                  d#ejJ                  d$ed&   de$e)e)f   f
d'       Z;	 dAd d!d"ejJ                  d#ejJ                  d$e<de	e$e)e)f      f
d(Z; G d) d*      Z=d+e'ejP                     d,e'ejP                     d-e'ejP                     d.e'ejP                     d/e'e'ejJ                        d0e'e'ee'ejJ                     gejJ                  f         de5ejP                  ejJ                  f   fd1Z>d2ed   de	e:   fd3Z?d4ejJ                  de5ejP                  e6f   d5ee@   de6fd6ZAd7e@de	e6   fd8ZBd9eejJ                  e6f   de6fd:ZC e jr                  d       G d; d<             ZD e jr                  d       G d= d>             ZEd?ed   de	eE   fd@ZFy)B    N)Counterdefaultdict)Callable)LiteralOptionaloverloadTYPE_CHECKINGTypeVarUnion)config)index_vars_no_squeeze)sympy_product
sympy_subs)
OrderedSet)Identity)	try_solve)symbol_is_typeSymT   VTU.loop_tiling)FloorDivModularIndexingFusedSchedulerNodeSchedulerNodeexprreturnc                    | j                         ryt        | t              ryt        | j                        dk(  sJ t        t        | j                              }t        | t              r;t        t        j                  | j                  d   | j                  d         |      }n t        t        j                  | d      |      }|r|d   j                         sy|d   S )zw
    Given an expr with a single free symbol, solve for a constant relation that would make
    this expression 0.
    Nr   r      )is_constant
isinstancer   lenfree_symbolsnextiterr   r   sympyEqargs)r    free_symbolouts      b/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/tiling_utils.pysolve_for_zeror0   %   s    
 	D(	#t  !Q&&&tD--./K$(1tyy|<kJq);7c!f((*q6M    c           	      
   t        | j                        dk(  ryt        t        | j                              dt        j
                  dt        t        j
                     ffd}| j                  t              s| j                  t              s ||       S g }g }t        j                  j                  |       D ]x  }t        |t        j                        rKd}|j                  D ]5  }t        |      }||j!                         sJ d}|j#                  |       7 |rf y|j#                  |       z |syt%        |      }	 ddt        j
                  d	t        j
                  d
t        t        j
                     dt        j
                  fd}	|j'                  t        |	      j'                  t        |	      }
 ||
      }|rt)        ||i      dk7  ry|j#                  |       t        t+        |            dk(  r|d   S y)a  
    Giving an expr with a single free symbol, try to find a tiling that would
    make the expression coalesced with respect to that symbol.

    Tiling an expression `x` by `y` means that the expression will now be indexed
    by both the original (x) and by (x * y). So we are looking for a
    multiplicative factor that will make ((x + 1) * y) - (x * y) == 1.

    To simplify things for sympy, we'll try just x * y == 1, check x(1) and x(0).
    r   Nr    r!   c                    | j                  t              s| j                  t              rJ t        | j                        dk7  ry t        t        j                  | d            }|r|d   j                         sy |d   S )Nr   )	hasr   r   r&   r'   r   r*   r+   r$   )r    r.   r-   s     r/   _solve_simple_exprz,solve_for_tiling.<locals>._solve_simple_exprK   sk    88O,TXXh5GGGt  !Q&q);7#a&,,.1vr1   FTxyzc                     | |z  S N )r6   r7   r8   s      r/   indexing_div_repz*solve_for_tiling.<locals>.indexing_div_repy   s    
 1ur1   r   r:   )r&   r'   r(   r)   r*   Exprr   r4   r   r   Add	make_argsr%   Mulr,   r0   r$   appendsumreplacer   r   )r    r5   required_valueseq_1_expressionsargseenmul_argr.   	eq_1_exprr<   eq_1_expr_simplifiedr-   s              @r/   solve_for_tilingrK   :   s    4"tD--./K 0D  88O$TXXh-?!$''O
 yy""4( )c599%D 88 ,$W-;(((&&s+, ##C(%)( $%I
 #':::: EJJ 
	 %,,_>NOWW" 1
2C*Yc(:;q@3
:o&'1,q!!r1   index
var_rangesc                 *   i }| j                   D ]  }||v rd||<   t        |      ||<    t        | |      }|D ]1  }|| j                   vrd||<   	 t        | |      }||k(  r|c S d||<   3 y# t        $ r t        j                  d| |       Y Ww xY w)z
    Try to find the variable that this index is broadcast over.
    A broadcast pattern is one where consecutive values of a variable
    access the same memory location (e.g., x // 10).
    r   r   zero division error %s %sN)r'   get_hintr   ZeroDivisionErrorloop_tiling_loginfo)rL   rM   	variablesv
zero_indexnew_vals         r/   find_broadcast_varrX      s     *,I '
?IaL#A;IaL	' E9-J E&&&	!	 	2G
 j H	!  ! 	  !<eYO	s   A// BBc                    t         j                  j                  |       }|D ]
  }||v s|c S  i }| j                  D ]  }||v rd||<   t	        |      ||<    t        | |      }|D ]<  }d||<   	 t        | |      }||z
  dk(  rd||<   t        | |      |z
  dk(  r|c S d||<   > y# t        $ r t        j                  d| |       Y bw xY w)z;
    Try to find the symbol which coalesces this index
    r   r   rO   r#   N)	r*   r>   r?   r'   rP   r   rQ   rR   rS   )rL   rM   top_level_termsrU   rT   rV   rW   s          r/   find_coalesced_varr[      s    ii))%0O H
 *,I '
?IaL#A;IaL	' E9-J 	!	 	2G Z1$IaL 5),w61<	!  ! 	  !<eYO	s   1B(( C
CT)frozenc                       e Zd ZU dZeej                     ed<   eej                     ed<   eej                  ee
   f   ed<   eej                  ee
   f   ed<   eej                  ef   ed<   y)FusedNormalizedReadsWriteszO
    Normalized reads and writes for nodes in the same FusedSchedulerNode.
    
index_varsreduce_varsreadswritesrM   N)__name__
__module____qualname____doc__r   r*   Symbol__annotations__dictr=   strintr;   r1   r/   r^   r^      sk     5<<((ELL))

JsO+,,Z_,--U\\3&''r1   r^   nr   pointwise_numel	red_numelnone_if_not_divisiblec                      y r:   r;   rl   rm   rn   ro   s       r/   get_pw_red_splitsrr      s     58r1   Fc                      y r:   r;   rq   s       r/   rr   rr      s     +.r1   c                    | j                         s%t        | j                  j                  d         |k(  r^| j                  j                  | j                  j                  d   f| j                  j
                  | j                  j                  d   ffS t        | j                  j                  d         ||z  k(  sJ t        | j                  j                  d         dz
  }d}|dk\  r0|| j                  j                  d   |   z  }||k(  rn|dz  }|dk\  r0|dk\  rr| j                  j                  d   d| }| j                  j                  d| }| j                  j                  d   |d  }| j                  j                  |d  }	||f|	|ffS |ry | j                  j                  | j                  j                  d   f| j                  j
                  | j                  j                  d   ffS )Nr   r   )is_reductionr   _bodysizes	iter_varsr`   r&   )
rl   rm   rn   ro   iprod	pw_splitsrx   
red_splitsred_varss
             r/   rr   rr      s    	~~=q)9:oMWWa 01WW  !''--"23
 	

 q)*o	.IIIIAGGMM!!AD
q&a ##9	Q	 q& 	AvGGMM!$Qq)	GG%%a*	WW]]1%ab)
77$$QR(9%*'=== WWa 01WW  !''--"23
 	
r1   c            	       \    e Zd ZdZded   fdZdeeef   fdZdedede	eeef      fd	Z
y
)NodeSplitGetterz_
    Finds a Pointwise, Reduction Split that compatible with all nodes in a SchedulerNode.
    noder   c                    || _         |j                  d   d   | _        |j                  d   d   | _        t	        t
              | _        d| _        t               | _        |j                  d   }t        |j                               D ]W  }t        |t        j                  j                  j                        s3t!        || j                  | j                  d      }|0| j                  j#                  |j$                  j&                         |\  \  }}\  }}t        j                  j(                  j*                  j,                  j/                  |||f| j                        \  }}| j                  t1        |         j#                  t3        |             |dk7  rt5        |      f| _        t3        |      t3        |      f}| j                  j#                  |       Z t               | _        y )Nr   r   r;   T)ro   )r   grouprm   rn   r   r   pw_split_optionsreduction_splitall_node_sizesreversed	get_nodesr%   torch	_inductor	schedulerr   rr   addrv   rw   codegensimd
SIMDKernelprepare_split_iteration_lengthsr&   tupler   seen_pw_splits)	selfr   fused_grouprl   maybe_splits_n_pw_splitsn_red_splitsn_sizes	            r/   __init__zNodeSplitGetter.__init__'  s    	+/::a=+;%)ZZ]1%5>I*>U&(?I|jjm$..*+ $	,Aa!:!:!H!HI
 -4''tL ###''62>/Q/q, '',,77WW+|!<dnn &K !!#k"2377k8JK r!(5l(C'E$K(%*=>F##F+I$	,L 2<r1   r!   c                 ~   t        | j                        dk(  rt        t        | j                              S t	        | j
                  j                               }t        |dd      D ]  }| j
                  |   D ]&  }| j                  || j                        x}s"|c c S  | j
                  |   D ]o  }t        t        |      dz
        D ]S  }t        |d| t        |||dz          fz   ||dz   d z         }| j
                  t        |         j                  |       U q  | j                  f| j                  ffS )zI
        Get a compatible pointwise, reduction split of the node
        r   r   r#   N)r&   r   r(   r)   maxr   keysrange	try_splitr   r   r   r   rm   rn   )r   max_pw_splitpw_split_lenpw_splitr.   ry   	new_splits          r/   get_node_splitszNodeSplitGetter.get_node_splits]  sV   
 t""#q(T001224005578!,26 	IL 11,? ..43G3GHH3HJ
 !11,? Is8}q01 IA % 1(!a!e)<=?@"1q57+,!I
 ))#i.9==iHII	I  %%'$..):;;r1   pwredc                    ddl m}m} || j                  v ry| j                  j	                  |       | j
                  D ]  \  }}	 ||z   }||f}|j                  ||      \  }	}
t        |
      dk(  sJ |	dt        |       }t        t        j                  j                  |            }||k7  sq| j                  ||      x}s|c S  ||fS # |$ r Y  yw xY w)zs
        See if this split is compatible, and potentially returning a longer split
        than the input.
        r   )	CantSplitr   Nr#   )torch._inductor.codegen.simdr   r   r   r   r   _split_iteration_rangesr&   r   	itertoolschainfrom_iterabler   )r   r   r   r   r   n_pwn_redgroupslengthssplitsgetterspw_group_splitsflattened_pw_splitsr.   s                 r/   r   zNodeSplitGetter.try_splitx  s     	G$$$#.. 	KD%c-","D"DVW"U w<1$$$$Ys2w/O
 #(	(E(Eo(V"W"b(..)<cBB3BJ#	& 3w  s   CCCN)rc   rd   re   rf   r   r   r   Splitr   r   r   r;   r1   r/   r   r   "  sX    4>9:4>l<ue|!4 <6E  (5;N2O r1   r   rx   r}   norm_pw_varsnorm_red_vars
new_rangesreturn_getters_groupsc           	         t        d |D              }t        j                  d|       }d}t        |       dk(  rt        |      dk(  ri S t        |      t        ||z         k(  sJ g }	|D ]'  }
|	j	                  |
D cg c]
  } ||       c}       ) i }t        t        |	| |fd            D ]f  \  }\  }
}t        |
      t        |      k7  r|dk(  sJ t        |      dk(  sJ 8|j                  t        |
|      D ci c]  \  }}||
 c}}       h d}i }t        |||z   d      D ]l  \  }}g }t        t        |            D ]  }|j	                  ||          |dz  } d}t        t        |      dz
  dd      D ]  }||z  |||   <   ||   |z  } n |j                         D ci c]  \  }}|t        ||       c}}S c c}w c c}}w c c}}w )zBMaps original variables to expressions using normalized variables.c              3   2   K   | ]  }t        |        y wr:   )r&   ).0ss     r/   	<genexpr>z$apply_var_mapping.<locals>.<genexpr>  s     .a3q6.s   zv_0:r   T)strictr   r   )rB   r*   symbolsr&   rA   	enumeratezipupdater   itemsr   )rx   r}   r   r   r   r   num_vars	flat_varscountapply_groupsr   giter_vars_to_flat_varsry   	var_grouprU   flat_vars_to_new_vars	new_rangenew_var
range_varsr   rz   ks                          r/   apply_var_mappingr     s'   . .:..HXJ/0IE
9~s8}1	z?c,">????L& ;59aQy\9:;  !*L9h/=" 
PE9
 u:Y'6M6y>Q&&&%%E98M&N1q!t&NO
P E!L=0 '	7 
s9~& 	Ai./QJE	 s9~)2r2 	'A3:T>!*Q-0Q<$&D	'' +002Aq 	
:a.// = : 'O"s   =G
G0Gr   c           
        ) t        t              }t        t              }| j                         }| j                         }t               }t               )|D ]O  }t        j
                  j                  j                  ||      r)j                  |       ?|j                  |       Q t        )fd| j                  j                  D              }| j                  d   d   }| j                  d   d   }	t        d ||	fD              ryt        |       j                         \  }
}t        |
|d      \  \  }}}t!        | j#                               D ]T  }t%        |t&        j(                  j                  j*                        s3|j,                  }|j.                  r yt        t              }t        t              }|D ],  }|j1                  |      D ]  }||   j                  |        . |D ],  }|j3                  |      D ]  }||   j                  |        . |s|st5        |||	      \  \  }}\  }}|
|z   }||f}t&        j(                  j6                  j8                  j:                  j=                  |||	      }t&        j(                  j6                  j8                  j:                  j?                  ||      \  }}tA        ||||||      }dtB        jD                  d	tB        jD                  fd
}|jG                         D  !ci c]  \  } }!tI         ||       |      |! }"} }!|jG                         D #!ci c]  \  }#}!tI         ||#      |      |! }$}#}!|"jG                         D ]  \  }}%||xx   |%z  cc<    |$jG                         D ]  \  }}%||xx   |%z  cc<    W |jG                         D &!ci c]0  \  }&}!t        j
                  jJ                  jM                  |&|      |!2 }}&}!|jG                         D '!ci c]0  \  }'}!t        j
                  jJ                  jM                  |'|      |!2 }}'}!tO        |||||      }(tP        jS                  d|(       |(S c c}!} w c c}!}#w c c}!}&w c c}!}'w )zjExtracts index variables, reduce variables, read/write expressions, and variable ranges from a fused node.c              3   T   K   | ]  }|j                   vs|j                    ! y wr:   )name)r   depremoved_bufferss     r/   r   z1extract_normalized_read_writes.<locals>.<genexpr>  s%      chho6Us   ((r   r   c              3   v   K   | ]1  }t        |t        j                        xr |j                           3 y wr:   )r%   r*   r=   r$   )r   vars     r/   r   z1extract_normalized_read_writes.<locals>.<genexpr>  s5       
C	$	>S__->)>	>s   79Nrl   )prefixr    r!   c                 0    | j                  t        d       S )Nc                     | S r:   r;   )r6   s    r/   <lambda>zIextract_normalized_read_writes.<locals>.remove_identity.<locals>.<lambda><  s    A r1   )rC   r   )r    s    r/   remove_identityz7extract_normalized_read_writes.<locals>.remove_identity;  s    <<+66r1   zNormalized Fused reads: %s)*r   r   get_buffer_namesget_operation_namesr   graphr   $can_buffer_be_removed_through_fusionr   read_writesra   r   anyr   r   r   listr   r%   r   r   r   rv   indirect_varsget_all_read_exprget_all_write_exprrr   r   r   r   r   r   r   r*   r=   r   r   sizevarssimplify_with_rangesr^   rR   rS   )*r   ra   rb   all_output_namesop_namesoutputsbuf_nameinputsrm   rn   r{   r|   r   r   rangesrl   bodyn_readsn_writesinpr    r.   rx   r   r}   r   r   r   r   r   var_mapr   readrU   n_reads_newwriten_writes_new	buf_namesrw	fused_outr   s*                                            @r/   extract_normalized_read_writesr     s    0;:/FE0;J0GF,,.'')H)|G'1|O$ "77AA(HU)KK!	"   ,,22 F #'**Q-"2O JJqM!,I  #Y/  +D1AACIz -B:c-)!\=6 $.."# E&!U__66DDEww 5@5L6A*6M  	'C..s3 '!!#&'	'  	(C//4 (""3'(	( x=N	>
: K":8\ Z'/OO##((33SS 	 OO##((33KK 	*
)
 $!
	7%** 	7 	7 JQ
>EdAJt,g69
 

 %NN,
q u-w7:
 

  +002 	%OD)$K9$K	%  ,113 	&OD)4LI%L	&IE&P IN@D1--a8!;E  IO@D1--a8!;F  +I 5yA;

s   
Q'=Q-=5Q35Q9addrr   c                 0   g }| j                   D ]B  }|j                  |      }t        |t        j                        r/|2|j                  |       D ddlm} |j                  j                  j                  t        |      t        j                        S )z7
    Score addr according to its approximate size.
    r   r   fallback)r'   getr   r   INDIRECTrA   virtualizedr   r   r   atomically_apply_size_hintr   r   unbacked_symint_fallback)r   rM   r   	var_sizesrU   v_sizer   s          r/   	get_scorer
  ^  s     I %"a/F4FV$	%
 7766i 6+J+J 7  r1   r   c                     t         j                  j                  |       }|sy t         j                  j                  j	                  t        |j                               t        j                        S Nr  )	r   r   try_get_bufferr   r  r   get_sizer   r  )r   bufs     r/   try_get_buf_sizer  r  sT    
''
 
 
*C7766clln%0O0O 7  r1   rU   c                     t        | t              r| S t        j                  j                  j                  | t        j                        S r  )r%   rk   r   r   r   	size_hintr   r  )rU   s    r/   rP   rP   {  s7    !Sww))!f6U6U)VVr1   c                   D    e Zd ZU dZej
                  ed<   eed<   eed<   y)	VarTilingzm
    Tiling of a var by `tiling_factor` that yields additional coalesced mem accesses by `benefit_score`
    r   tiling_factorscoreN)rc   rd   re   rf   r*   rg   rh   rk   r;   r1   r/   r  r    s     
Jr1   r  c                   |    e Zd ZU eej
                  ef   ed<   eej
                  ef   ed<   eed<   dZ	e
e   ed<   y)CoalesceVarAnalysiscoalesced_by_varuncoalesced_addrsnorm_read_writesNsuggested_split)rc   rd   re   ri   r*   r=   rk   rh   r^   r  r   r  r;   r1   r/   r  r    s?    
 5::s?++EJJO,,00+/OXi(/r1   r  
fused_nodec           
      :   t        |       }|y|j                  }|j                  }|j                  }t	               }t	               }t        j                  d |j                         D        d |j                         D              D ]  \  }\  }}	t        |j                  |j                  j                         z
        }
|
r<t        |||	      }|dk(  rOt        ||      }|t        ||      }d}|	D ]X  }t        j                  j!                  |      x}s%t#        |      x}s3|t%        ||      |j&                  j(                  z  z  }Z ||rdndz  }|r||xx   |z  cc<   ||xx   |z  cc<    |st+        |||      S t-        t              }|j                         D ]  \  }}t.        j1                  |j                  d      }|j                  D ]  }||vr|dk(  r||= t3        ||      }d||<   t5        |      }||j7                         r|j8                  sLt;        |      }t        j                  j<                  j?                  |||         sdtA        fd	|||   |z  fD              s||   |xx   |z  cc<     tC        |      dk(  rt+        |||      S d}d}|j                         D ])  \  }}|j                         D ]  \  }}||kD  s||f}|} + |t+        |||      S t+        |||tE        |d   |d   |      
      S )a[  
    Find variables that coalesce the reads and writes and score the total size.

    If uncoalesced memory expressions are found, look for additionally tiling of variables
    which will coalesce memory accesses.

    For instance - for the following expression:

    (32*p0) // 2048

    Tiling p0 by 64 will make this expression coalesced.
    Nc              3   $   K   | ]  }d |f 
 yw)TNr;   r   items     r/   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s     0$$0   c              3   $   K   | ]  }d |f 
 yw)FNr;   r   s     r/   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s     24%2r"  r   r   r#   )r  r  r     c              3   r   K   | ].  }t         j                  j                  j                  |       0 y wr:   )r   r   r   statically_known_lt)r   blockMIN_TILING_BLOCKs     r/   r   z,analyze_memory_coalescing.<locals>.<genexpr>  s1         445EuMs   47)r  r  r  r  )#r   ra   rb   rM   r   r   r   r   boolr'   r   r
  r[   rX   r   r   r  r  mindtypeitemsizer  r   ri   fromkeysr   rK   r$   
is_integerrk   r   r&  allr&   r  )r  r  ra   rb   rM   r  r  is_readmemory_exprr   indirect_exprsizemaybe_coalesced_vartotal_scorer   r  buf_sizetiling_scoresuncoalesced_expr
addr_score	expr_subsrU   single_var_exprr  best_tilingbest_tiling_scorer   tiling_countertile
tile_scorer(  s                                 @r/   analyze_memory_coalescingrA    s     6jA""E$$F!,,J07	/6y-6__0%++-026<<>2. ':))+y
 $$'7'B'B'G'G'II
 j)<190jI &"4[*"M! 	HHww--h777,X666 s8T2SYY5G5GGG	H 	Gq*01[@1k*k9*O':R "-/-
 	
 7B'6JM(9(?(?(A ":$*MM"2"?"?C	!..  	:A
"Q!()99EOIaL,_=M%$002$//.M77##77zRS}U
  ! +Z]m-KL  !]+z9+A 	:":H =Q"-/-
 	
 59K,224 /^ . 4 4 6 	/D*--"Dk$.!	// "-/-
 	
 )+)!+a.+a.BST	 r1   )F)Gdataclassesr   collectionsr   r   collections.abcr   typingr   r   r   r	   r
   r   r*   r   torch._inductorr   torch._inductor.dependenciesr   torch._inductor.utilsr   r   torch.utils._ordered_setr   torch.utils._sympy.functionsr   torch.utils._sympy.solver   torch.utils._sympy.symbolr   r   r  r   r   r   r   r=   r   r   rg   VarsAndRanges_logginggetArtifactLoggerrc   rR   r   r   torch._inductor.schedulerr   r   r0   rK   ri   rk   rX   r[   	dataclassr^   rr   r)  r   r   r   rj   r
  r  rP   r  r  rA  r;   r1   r/   <module>rR     sU     , $ M M   " > ; / 1 . :  CLCL 	ejj#od5<<($uzz*::; ..228]K B K (< *W5:: W(5::*> Wt :: #'

C#8 ejj F#::##'

C#8#ejj#L d#	( 	( $	( 
88ZZ8 zz8 #4=	8
 eM=0128 
8 

 -2	..ZZ. zz. #5>	.
 =-'(. 
. #(	#
#
ZZ#
 zz#
  	#

 eM=012#
Lu upBELL!B5<< B u||$B %	B
 T%**%&B  XtEJJ/?.@%**.L%M NOB 
%,,


"#BJ|
5
6|()|~
**"&u||S'8"9FPQTo(s x} Wejj#o& W3 W d#  $ d#
0 
0 $
0S;<S!"Sr1   