
    qiA              	          d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ d dl	Z	d dl
Z
d dlmc mZ d dlmZ d dlmZ ddlmZ ddlmZmZmZ dd	lmZ  ej4                  e      Z G d
 de      Z G d de      Ze j>                  defd       Z de!defdZ"dejF                  defdZ$d4de
jJ                  de&de&fdZ'd4de
jJ                  de&de&fdZ(dejF                  de&fdZ)dejF                  de&fdZ* G d de      Z+ G d de      Z, G d de      Z-d gd!ggZ.d"gd"ggd#gd#ggd$gd%gggZ/g d&g d'g d'gZ0dee1   fd(Z2d)e&d*e&d+ede1fd,Z3dejF                  de1fd-Z4d.e
jj                  jl                  de&fd/Z7d.e
jj                  jl                  de&fd0Z8	 	 d5d.e
jj                  jl                  d1ee&   d2e9de1fd3Z:y)6    N)IntEnum)AnyOptional)hint_int)normalize_function   )ir)get_dtype_sizesnode_args_kwargssympy_product)Vc                        e Zd ZdZdZdZdZdZy)	NCCL_COLLr   r            N)__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER
ALL_TO_ALLUNSUPPORTED     c/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/_inductor/comm_analysis.pyr   r      s    JJNJKr   r   c                       e Zd ZdZdZdZy)NVIDIA_GPU_TYPEr   r   r   N)r   r   r   VOLTAAMPEREHOPPERr   r   r   r   r      s    EFFr   r   returnc                  8   t         j                  j                  j                  t         j                  j                  j                        xs d} d| v rt
        j                  S d| v rt
        j                  S d| v rt
        j                  S t
        j                  S )N V100A100H100)	torchutilscollect_envget_gpu_inforunr   r    r!   r"   )gpu_infos    r   get_gpu_typer/   %   s|    {{&&33EKK4K4K4O4OPVTVH$$$	8	%%%	8	%%% %%%r   kernel_namec                       J d v rt         j                  S d v rt         j                  S d v rt         j                  S t	         fddD              rt         j
                  S t         j                  S )N
all_reduce
all_gatherreduce_scatterc              3   &   K   | ]  }|v  
 y wNr   ).0commr0   s     r   	<genexpr>z7get_collective_type_from_kernel_name.<locals>.<genexpr>;   s     HTT[ Hs   )
all_to_allalltoall)r   r   r   r   anyr   r   )r0   s   `r   $get_collective_type_from_kernel_namer=   3   sr    """{"###		$###	[	('''	H-GH	H###$$$r   nodec                     t        | t        j                        st        d|        | j                  }|J t        |      S )Nz!node is not a collective kernel: )
isinstancer	   _CollectiveKernel
ValueErrorpython_kernel_namer=   )r>   names     r   get_collective_typerE   A   sG    dB001<TFCDD""D/55r   sizefallbackc                     t        |       }t        |t        j                        rt	        |      S t
        j                  j                  j                  ||      S )NrG   )	r   r@   sympyIntegerintr   graphsizevars	size_hint)rF   rG   numels      r   get_ir_node_size_numelrQ   J   sD    $E%'5z77%%eh%??r   c                 j    t        j                  t        j                  | d      }t	        ||      }|S )Nr   rI   )	functoolsreduceoperatormulr   )rF   rG   rP   results       r   get_fx_node_size_numelrX   Q   s+    X\\43Eeh/FMr   c                     d}| j                   D ]F  }t        |j                  j                        }||t	        |j                  j
                        z  z  }H |S )Nr   )inputsrQ   layoutrF   r
   dtype)r>   sz_bytesinprP   s       r   get_collective_input_size_bytesr_   W   sQ    H{{ =&szz7EN3::+;+;<<<= Or   c                     t        | t        j                        r5t        | t        j                        sddlm}  || j                  d         S t        d|        )Nr   _get_group_size_by_namezUnsupported collective type: )r@   r	   rA   _WaitKernel"torch.distributed.distributed_c10drb   constant_args	TypeError)r>   rb   s     r   get_collective_group_sizerh   _   sK    $,,-jr~~6VN&t'9'9"'=>>7v>??r   c                       e Zd ZdZdZdZy)NCCL_HWr   r   r   N)r   r   r   NVLINKPCINETr   r   r   rj   rj   m   s    F
C
Cr   rj   c                       e Zd ZdZdZy)	NCCL_ALGOr   r   N)r   r   r   TREERINGr   r   r   ro   ro   s   s    DDr   ro   c                       e Zd ZdZy)
NCCL_PROTOr   N)r   r   r   LLr   r   r   rs   rs   x   s	     
Br   rs   g333333@gffffff@g333333?      ?g      @g@)     C@rv   gffffff4@)gU@g     6@g      3@c                 H   | j                   }|J t        |dd      }|j                  d   }ddlm}  ||      }t
        j                  j                  |      }t        j                  d|       }t        |      }t        |       \  }	}
d|v r|	dd  |	d   z   }	t
        j                  j                  ||	      5 } ||	i |
}t
        j                  j                  j                  j                  |       d d d        j                   }|dk  ry |d
z  }|S # 1 sw Y   "xY w)NrC   r%   rc   r   )_resolve_process_groupzcuda:all_gather_into_tensor_outr   )groupdevice     @@)r>   getattrrf   re   rx   r)   distributedget_rankr{   evalr   _time_estimatorops_c10d_functionalwait_tensordefaultestimated_time)snodekernelpy_kernel_namepg_namerx   pgrankr{   fnargskwargstime_estimatorwest_time_usest_time_mss                  r   /estimate_nccl_collective_runtime_nccl_estimatorr      s+   ZZFV%92>N""2&GI		(B!!**2.D \\E$.)F	n	B$U+LD& $~5ABx$q'!				*	*F	*	C :~		""..66q9: !//K Q#K: :s   ;<DD!tensor_storage_size_bytes
group_sizecollc                    | dz  dz  dz  }d}t        j                  ||z        }|}|dk  ryt        j                  }t        j
                  }t        j                  j                  j                  }	t        j                  j                  j                  }
t               }|dk  r|dz
  nd}|dk(  r|nd}t        |   |   }|dk(  r|	n|
}d}||z  }t        |||dkD  s|t        j                  k(  rdndz        }|t        j                  k(  r	d|dz
  z  }nC|t        j                   k(  r	d|dz
  z  }n'|t        j"                  t        j$                  fv r|dz
  }d|z  z  }||z  }|d	z  }t&        j(                  }|t        j                  k(  r|dkD  rd|z  }n9d}n6|t        j"                  t        j$                  t        j                   fv r|dz
  }t*        |   |   }t,        |   |   |   }t,        t&        j.                     |   |   }d
}|dkD  rd}t1        ||      }||z
  |z  ||z  z   z  }|dz  }||z  }||z   }|dz  }|S )a:  
    Returns estimated NCCL collective runtime in milliseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r   g      ?gUUUUUU?ru   g    eAg        r|   g    .A)mathceilro   rq   rs   rt   r)   	_inductorconfigintra_node_bwinter_node_bwr/   llMaxBwsminr   r   r   r   r   rj   rk   baseLathwLatrm   max) r   r   r   tensor_storage_size_GBnum_gpus_per_nodenNodesnRanks	nccl_algo
nccl_protobwIntrabwIntercompCapIndexindex2index1llMaxBwbw	nChannelsbusBwnstepsratio	bandwidthbandwidth_GB_per_nsintraHwnInterStepslatencyintraLatinterLatnetOverhead
latency_nstransport_nsnsmss                                    r   %estimate_nccl_collective_runtime_implr      s     7=DtK YYz$556FF{ IJ
 oo$$22Goo$$22G>L!Q;VaZAF#q[\aFvv&G aKWBINE !ty/C/C'C9)	UE y###fqj!	%%	%fqj!	)**I,@,@A	A! 6\V#EI#c/ nnGy###A:f*KK	)**I,@,@)BVBVW	Wqj i ,GW~i(4HW[[!),Z8H Kz8[)H$0;3IIIG3J *,??L	
	"B	cBIr   c                 ^    t        |       }t        |       }t        |       }t        |||      S )9  
    Returns estimated NCCL collective runtime in nanoseconds (ms).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    )r_   rh   rE   r   )r>   r   r   r   s       r    estimate_nccl_collective_runtimer   I  s8     !@ E*40Jt$D0!:t r   fx_nodec                    d| j                   | j                  }}t        |      }|j                  dd       dt        j
                  dt        fddt        j                  j                  ffd}t        j                  t        j                  j                  |||f       | j                  j                  dd      }t        |t        j
                        sy	 |      }|z   S )
zSEstimate the size of a collective operation in bytes, including inputs and outputs.Nouttr#   c                 `    t        | j                               t        | j                        z  S r6   )rX   rF   r
   r\   )r   s    r   tensor_bytesz1estimate_fx_collective_size.<locals>.tensor_bytesh  s!    %affh/.2IIIr   r^   c                     | j                   j                  dd       }t        |t        j                        sy d |      z  y )Nvalr   )metagetr@   r)   Tensor)r^   inp_valinput_bytesr   s     r   add_inp_bytesz2estimate_fx_collective_size.<locals>.add_inp_bytesk  sD    ((,,ud+'5<<0 K|G,,r   r   r   )r   r   dictpopr)   r   rL   fxNodepytreetree_map_onlyr   r   r@   )r   r   r   r   
output_valoutput_bytesr   r   s         @@r   estimate_fx_collective_sizer   ^  s    K<<&D&\F JJudJ J J-588== - 	v !!%.J*Z"F
+L%%r   c                 B    ddl m} t        |       } ||       s|S |dz  S )zEstimate the memory footprint of a collective operation in bytes.

    This returns the total bytes that need to be live concurrently in memory.
    For all_reduce, we divide by 2 since it can be done in-place.
    r   )is_all_reduce_tensorr   )#torch._inductor.fx_passes.bucketingr   r   )r   is_all_reducerF   s      r   'estimate_fx_collective_memory_footprintr     s)     'w/D$W-4<419<r   override_sizeuse_nccl_estimatorc                    
 ddl m} t               }n}t         j                  t
              rJ t         j                   j                   j                  d      }|J |\  
d    |      }t         j                  t        j                  j                        sJ t         j                  j                               }dt        t           f
 fd}|r |       }	|	|	S t!        |||      S )r   r   ra   T)r   r   normalize_to_only_use_kwargs
group_namer#   c                  d   ddl m} m}  |      }t        j                  j
                  j                  |      dk(  ry  | |      }|j                  |      }|j                  sy t        j                  f      \  }}dt        j                  ffddt        j                  dt        fd}dt        dt        ffd	|D cg c]
  } |       }}t        j                   ||      \  }	}
j"                  }t%        |t        j&                  j(                        sJ t        j                  j+                  |
      5 } ||	i |
}t        j,                  j.                  j0                  j3                  |       d d d        j4                  }|dk  ry |dz  }|S c c}w # 1 sw Y   'xY w)Nr   )_get_pg_default_devicerx   faker#   c                 >    t        j                  | ng||      S )N)r\   r{   )r)   empty)rF   r\   r{   r   s      r   _tensorzVestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>._tensor  s&    ;;%-M? r   sc                 X    t         j                  j                  j                  | d      S )Nr   rI   )r   rM   rN   rO   )r   s    r   try_size_hintz\estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.try_size_hint  s"    77##--a!-<<r   ec                    t        | t        j                  j                        r | j                  d         S t        | t        j
                        r6 t        | j                               g| j                  | j                        S | S )Nr   )
r@   r)   r   r   r   r   rX   rF   r\   r{   )r   r   to_real_tensors    r   r   z]estimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate.<locals>.to_real_tensor  sa    !UXX]]+%affUm44!U\\* 6qvvx @A177AHHUUHr   )rz   r|   )re   r   rx   r)   r~   distributed_c10dget_backend_get_backendsupports_time_estimater   tree_flattenr   rJ   ExprrL   r   tree_unflattentargetr@   _ops
OpOverloadr   r   r   r   r   r   )r   rx   r   r{   backend	flat_argsflat_args_pytree_specr   a	real_argsreal_kwargsr   r   r   r   r   r   r   r   r   r   r   r   s                   @@r   _nccl_estimatezEestimate_nccl_collective_runtime_from_fx_node.<locals>._nccl_estimate  s   	

 $J/--99"=G'+//&)--+1+>+>f~+N(	(	ELL 		=UZZ 	=C 	=	c 	c 	 1::1^A&:	:!'!6!6yBW!X	;^^"ejj33444..R.8 	>NI--AII&&22::1=	> %33 ?!C' ;
	> 	>s   F!<F&&F/)re   rb   r   r@   r   strr   r   r   r)   r   r   r=   rD   r   floatr   )r   r   r   rb   r   opt_args_kwargsr   r   r  r   r   r   r   s   ``        @@@r   -estimate_nccl_collective_runtime_from_fx_noder    s    " K$?$H!$1!'..#...(\\~~%)	O &&&"LD&%J(4Jgnnejj&;&;<<</0C0C0EFD2HUO 2 2h $&"0!:t r   )i   )NT);rS   loggingr   rU   enumr   typingr   r   rJ   r)   torch.utils._pytreer*   _pytreer   %torch.fx.experimental.symbolic_shapesr   torch.fx.operator_schemasr   r%   r	   r
   r   r   virtualizedr   	getLoggerr   logr   r   	lru_cacher/   r  r=   IRNoderE   SizerL   rQ   rX   r_   rh   rj   ro   rs   r   r   r   r  r   r   r   r   r   r   r   boolr  r   r   r   <module>r     sx            $ $ : 8  C C  g! g  
&o 
& 
&%c %i %6bii 6I 6@ @s @S @ s S "))  @BII @# @g  
  	
 		  
	 
	 
		,,huo Be"e03e;De
eZ299  *$& $&3 $&N=UXX]] =s =  $(#bXX]]bC=b b 	br   