
    qiP0                       U d dl mZ d dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
Z
d dlZd dlmZ d dlZd dlmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZ dd	lm Z m!Z! dd
l"m#Z# ddl$m%Z%m&Z& erd dl'm(Z(m)Z) dZ*da+de,d<   ejZ                   G d d             Z.ejZ                   G d d             Z/ddZ0d dZ1ejd                  d!d       Z3	 	 	 	 	 	 	 	 	 	 d"dZ4 G d de      Z5d#dZ6 G d d      Z7	 	 	 	 d$dZ8	 	 	 	 	 	 d%dZ9y)&    )annotationsN)AnyTYPE_CHECKINGUnion)patch)
OrderedSet   )configselect_algorithm)BufferChoiceCallerLayoutMultiTemplateBufferOperationBufferShapeAsConstantBuffer
StorageBox	TensorBox)KernelInputsMMKernelInputs)SchedulerNode)NullHandlerV)	GeneratorSequencedistributed_autotunedist.ProcessGroup | None_AUTOTUNE_PGc                  .    e Zd ZU dZdZded<   dZded<   y)_DistributedAutotuneStatezA
    State used to track autotuning during a graph_context()
    r   intautotuned_indexautotuned_local_countN)__name__
__module____qualname____doc__r!   __annotations__r"        j/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/distributed_autotune.pyr   r   (   s      OS "#3"r)   r   c                  "    e Zd ZU ded<   ded<   y)_DistributedAutotuneInfor    indexboollocalN)r#   r$   r%   r'   r(   r)   r*   r,   r,   7   s    JKr)   r,   c                     t        j                         r@t        j                         r,t         t         j                  j                  d      at        S y )Npt2_distributed_autotune_pg)pg_tag)distis_availableis_initializedr   distributed_c10d_new_group_with_tagr(   r)   r*   get_autotune_pgr8   =   sI    t22400DD4 E L r)   c                l    t         j                  sJ t        |       }t        |      }t	        | |       y)z
    Finish the distributed autotuning by propagating the autotuning results
    between the ranks and then replacing the placeholder with the real Buffer.
    N)r
   distributed_max_autotune_gemm_autotune_local_nodes_sync_autotune_remote_nodes)	schedulerautotune_resultschoices_by_indexs      r*   schedulerA   I   s5    
 ////,Y7-.9&67r)   c               #    K   t        t        j                  d      t              rJ t        j                  t                      	 d t        j                  t                      y# t        j                  t                      w xY ww)zd
    Wrapped around processing a graph, sets up figuring out which ranks tune
    which shapes.
    F)check_poisonedN)
isinstancer   get_distributed_autotune_stater   set_distributed_autotune_stater   r(   r)   r*   graph_contextrG   T   sg      	((>!   $$%>%@A8	((7((7s   ABA) B)BBc                   t         j                  syt               x}syt        |      dk  ryt        j
                  }|j                  }|xj                  dz  c_        ||j                         z  |j                         k(  }t        ||      t        j                  j                  t        <   |r|xj                  dz  c_        yt        j                  j                   j"                  j%                  t'        | ||            S )z
    Used by an op (like `mm`) to determine if the op should be autotuned
    locally (returns None) or remotely (returns a placeholder Buffer).
    Nr	   )r
   r:   r8   lenr   distributed_autotune_stater!   sizerankr,   current_nodemeta_DISTRIBUTED_AUTOTUNE_KEYr"   torch	_inductorirr   create_DistributedAutotuneBuffer)namechoicesinputslayoutautotune_pgstater-   r/   s           r*   maybe_autotune_remoter[   e   s     //*,,K,
7|q((E!!E	QK$$&&+*:*:*<<E5Mu6ANN12 ##q(#??''.."48 r)   c                  X     e Zd ZU dZded<   	 	 	 	 	 	 	 	 d fdZ	 	 	 	 ddZd	dZ xZS )
rT   z
    A MultiTemplateBuffer which represents a kernel being autotuned on a
    different rank. When `schedule` is called this will be replaced by the
    "real" buffer.
    str_kernel_namec           	     b    t         |   ||| j                  g t        i              || _        y )N)choice_timings_fnunfiltered_choicesallowed_prologue_inps)super__init___dummy_choice_timingsr   r^   )selfkernel_namerW   rX   	__class__s       r*   rd   z#_DistributedAutotuneBuffer.__init__   s:     	"88!",R. 	 	
 (r)   c                    t         N)NotImplementedError)rf   _hint_overrides     r*   re   z0_DistributedAutotuneBuffer._dummy_choice_timings   s
    
 "!r)   c                   ddl m} t        j                  t        j
                  dd      5  t        g | j                        }t        | j                  t              sJ |j                  | j                  |      } || j                  |g|j                         | j                        }t        |t              sJ |cddd       S # 1 sw Y   yxY w)zu
        Given a _SerializedChoice (autotune results from another rank)
        compute the final TensorBox.
        r	   )autotune_select_algorithmr>   N)r   rn   r   objectr   graphr   original_inputsrD   rX   r   
get_choicer^   nodesr   )rf   
ser_choicern   kernel_inputschoicebuffers         r*   autotunez#_DistributedAutotuneBuffer.autotune   s     	@\\!'';5 	*+BT-A-A+BCMdkk6222**4;;FF.!!##%	F fi000	 	 	s   BCC)rg   r]   rW   list[Buffer]rX   r   returnNone)rl   z
int | Nonerz   zdict[ChoiceCaller, float])rt   _SerializedChoicerz   r   )	r#   r$   r%   r&   r'   rd   re   rx   __classcell__)rh   s   @r*   rT   rT      sU     (( ( 	(
 
( "("	""r)   rT   c                p   t               }|sJ dg|j                         z  }t        j                  j	                  || |       t        d |D              }dg|z  }d}|D ]@  }|D ]9  }t        |t              sJ ||j                     J |||j                  <   |dz  }; B ||k(  sJ d| d|        |S )zT
    Perform the all_gather to collect the autotune results from all the ranks.
    N)groupc              3  2   K   | ]  }t        |        y wrj   )rI   ).0xs     r*   	<genexpr>z_sync.<locals>.<genexpr>   s     0SV0s   r   r	   zcount mismatch:  != )	r8   rK   rP   distributedall_gather_objectsumrD   r|   r-   )r?   rY   
all_states
node_countr@   check_countother_resultsrv   s           r*   r<   r<      s    
 "#K; 269I9I9K0KJ	''
4DK'X0Z00J150CK# # 	Ff&7888#FLL1999-3V\\*1K		 $V(8D&VV$r)   c                  L    e Zd ZdZddZd	dZed
d       Zedd       ZddZ	y)r|   z
    This is a serializer for the autotune choice. KernelTemplateChoice can't
    be serialized directly (the template and inputs prevent this) so we need to
    serialize it by parts and reconstruct later on.
    c                    || _         t        j                  |      | _        | j	                  |j
                        | _        y rj   )r-   r|   _template_uid_from_choicetemplate_uid_compute_kwargsdescriptionkwargs)rf   r-   rv   s      r*   rd   z_SerializedChoice.__init__   s4    
-GGO**6+=+=>r)   c                &   | j                         }i | j                  }d|v rF|j                         d   j                         d   }t	        j
                  ||d         |d   k(  |d<   i }ddlm}m}  ||      }	 |||	|||      }
|
j                  S )z=
        Deserialize the ChoiceCaller and return it.
        BLOCK_Kr   r	   EVEN_K)DictKernelTemplateParamsKernelTemplateChoice)
_template_from_uidr   rs   get_sizesympygcdkernel_template_choicer   r   rv   )rf   rX   rW   templater   kextra_kwargsr   r   paramsktcs              r*   rr   z_SerializedChoice.get_choice   s    
 **, DKK
 q!**,Q/A$yyF9,=>&BSSF8')	

 *&1"8V\66Rzzr)   c                j   | si S i }| j                  d      D ]  }|j                  dd      \  }}|j                         |j                         }}|dk(  rd||<   C|dk(  rd||<   N|j                         rt        |      ||<   m|j	                  d      r|j                  d      sJ |dd	 ||<    |S )
zI
        Given a template description turn it into input kwargs.
        ,=r	   TrueTFalseF')splitstripisdigitr    
startswithendswith)r   r   cfgkeyvals        r*   r   z!_SerializedChoice._compute_kwargs   s    
 I 46$$S) 	(Cyya(HCyy{CIIKCf}"s#s!#hs~~c*s||C/@@@!!Bis	( r)   c                   t        | t        j                        r<| j                  j                  dk(  ryt        d| j                  j                        t        | t        j                        ryt        dt        |              )z
        Given a ChoiceCaller figure out which template represents it. This
        is reversed by _template_from_uid().
        mmz!torch._inductor.kernel.mm.aten_mmzTODO: kernel z%torch._inductor.kernel.mm.mm_templatezTODO: )rD   r   ExternKernelCallerrv   rU   RuntimeErrorTritonTemplateCallertype)rv   s    r*   r   z+_SerializedChoice._template_uid_from_choice  sq     f.AAB}}!!T):"]6==3E3E2H#IJJ 0 E EF:V~677r)   c                    | j                   j                  d      }t               |d      }|dd D ]  }t        ||      } |S )z2
        See _template_uid_from_choice().
        .r   r	   N)r   r   globalsgetattr)rf   partsobjr   s       r*   r   z$_SerializedChoice._template_from_uid,  sO     !!'',ia!qr 	"A#q/C	"
r)   N)r-   r    rv   r   rz   r{   )rX   r   rW   r   rz   zChoiceCaller | None)r   r]   rz   z dict[str, Union[int, str, bool]])rv   r   rz   r]   )rz   r   )
r#   r$   r%   r&   rd   rr   staticmethodr   r   r   r(   r)   r*   r|   r|      s>    ?
4  0 8 8$r)   r|   c                >   g }| j                   D ]  }t        |t              s|j                  x}#t        |t              r4t        |t
              sE|j                  x}T|j                  x}c|j                  t              }|{|j                  sJ |j                         \  }}t        |j                  |      }	|j                  |	        t        j                   }
t#        |      |
j$                  k(  s!J dt#        |       d|
j$                   d       |S )zt
    Go through the nodes in the scheduler and autotune the kernels which
    should be autotuned by this rank.
    z'incorrect local autotuned nodes found (r   ))rs   rD   r   noderT   r   origin_noderN   getrO   r/   get_min_choicer|   r-   appendr   rJ   rI   r"   )r>   r?   r   
inner_noder   rN   info
min_choice_rv   rZ   s              r*   r;   r;   7  s.    13  ($.))#J,j"<=*&9:%111K:$$$D-xx12<zzz
 #113
A"4::z:'A (D ((E E$?$?? 
1#6F2G1HUMhMhLiijk? r)   c                   t        | j                        D ]  \  }}t        |t              st        |j                  x}t
              s4|j                  J |j                  j                  t           }|j                  ||j                           }|j                  }t        |t              sJ |j                  }t        |t              sJ |j                  |j                  k(  sJ | j                  ||||        y)zo
    Go through the nodes in the scheduler and autotune the nodes that were
    autotuned on remote ranks.
    N)	enumeraters   rD   r   r   rT   r   rN   rO   rx   r-   datar   r   rX   _replace_node)	r>   r@   ir   	dist_noder   out_tensorboxout_storage
out_buffers	            r*   r=   r=   j  s     Y__- D4dM*z))#Y&@0
 ((444((--.GHD%../?

/KLM',,Kk:666$))Jj/:::$$	(8(8888##J	1dCDr)   )rz   r   )r>   #torch._inductor.scheduler.Schedulerrz   r{   )rz   zGenerator[None, None, None])
rU   r]   rV   zlist[ChoiceCaller]rW   ry   rX   r   rz   z(TensorBox | ShapeAsConstantBuffer | None)r?   list[_SerializedChoice]rz   Sequence[_SerializedChoice])r>   r   rz   r   )r>   r   r@   r   rz   r{   ):
__future__r   
contextlibdataclassestypingr   r   r   unittest.mockr   r   torch._loggingrP   torch.distributedr   r3   torch.fxtorch.utils._ordered_setr    r
   r   rR   r   r   r   r   r   r   r   r   ru   r   r   r>   r   virtualizedr   r   collections.abcr   r   rO   r   r'   	dataclassr   r,   r8   rA   contextmanagerrG   r[   rT   r<   r|   r;   r=   r(   r)   r*   <module>r      sG   "   , ,       / &	 	 	 8 $ ' 3 3 )-& - # # #   
	8 8 8 
*4@JP-B4!4 4p8Z Zz0200fD2D1D 
Dr)   