
    qiSG                     X   d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
c mZ d dlm
c mc mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZmZmZm Z   e jB                  e"      Z# G d
 d      Z$	 d(dededee   de%de&e&e'df   e&e'df   f   f
dZ(ede'de'de'de ez  de'de%de&e'eejR                     f   fd       Z*edejR                  de'fd       Z+	 d(dedede,e'   dz  dee   de%de&e&e'df   e&e'df   f   fdZ-ej\                  j^                  Z0dejR                  dedee   de&e,e'   e,e'   f   fdZ1d ejd                  dedee   dejd                  fd!Z3d"ejh                  jj                  d#ee6   defd$Z7d%ededee   de&e'df   fd&Z8dejd                  fd'Z9y))    N)Sequence)AnycastOptional)	ShapeType)maybe_run_for_local_tensor)
DeviceMesh)redistribute_cost)DTensorSpec)_StridedShardPartial	Placement	ReplicateShardc                   n    e Zd ZdZ ej
                         ZddedefdZe	de
de
defd       Zd	 Zd
 Zy)ExplicitRedistributionContextaT  
    Within this context manager, DTensor will refuse to perform implicit redistribution,
    instead raising an error.  Manual calls to ``redistribute()`` are required wherever a redistribution
    must occur to avoid erroring.  This can be used to ensure that the user is aware of all redistribution.

    Note: it is easier to use this mode on just the forward pass of a typical DTensor program, as the backwards pass
    may contain implicit redistribution calls that are not visible to the user and difficult to replace with manual
    calls.  Redistribution during backward can be made explicit by writing `autograd.Function`s that are no-op
    during forward and perform a manual redistribution during backwards.

    enable (bool) if False, disables the context manager. Can be used nested inside an enabled region.

    strict (bool) if True, triggers on any redistribution.  If False, only triggers on redistributions that perform
    communication.

    mode (str) Determines what happens when ExplicitRedistributionContext triggers:
    "raise": raises an exceptoin, "warn" issues a warning
    enablestrictc                 X    || _         || _        |dvrt        d|       |dk(  | _        y )N)raisewarnzInvalid mode r   )_enable_strictRuntimeError_raise_on_redistribution)selfr   r   modes       e/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/distributed/tensor/_utils.py__init__z&ExplicitRedistributionContext.__init__0   s7    ((tf566(,%    src_specdst_specmessagec                     t        | j                  dd       x}r\d}|j                  r|j                  rd}nt	        ||      dk  }|s-|j
                  rt        |      t        j                  |       y y y )N_activeTFr   )	getattr_localr   r   r
   r   r   loggerwarning)clsr!   r"   r#   instancealloweds         r   observe_redistributionz4ExplicitRedistributionContext.observe_redistribution7   sv     szz9d;;8;G###G/(CqHG44&w//NN7+	  <r    c                 p    t        t        j                  dd       | _        | t        j                  _        | S )Nr%   )r&   r   r'   _prevr%   )r   s    r   	__enter__z'ExplicitRedistributionContext.__enter__H   s-    :AA9dS
7;%,,4r    c                 B    | j                   t        j                  _        y N)r/   r   r'   r%   )r   exc_typeexc_valexc_tbs       r   __exit__z&ExplicitRedistributionContext.__exit__M   s    7;zz%,,4r    N)TFr   )__name__
__module____qualname____doc__	threadinglocalr'   boolr   classmethodr   strr-   r0   r6    r    r   r   r      sa    & Y__F8t 8D 8 ,",.9,DG, , 
Br    r   global_shapemesh
placementsskip_offsetreturn.c                 P    t        | |j                  |j                         ||      S )a3  
    Compute the local tensor shape and the global offsets into the original tensor
    of a DTensor on its current global rank. This is useful for checkpointing purpose.

    Example:
    global_tensor = [[0,  1,  2,  3,  4], sharded on mesh (DP=2, TP=2) with (Shard(1), Shard(1))
                     [10, 11, 12, 13, 14]]

    This table shows the return value of local_shape and global_offset for each rank.
    (`local_tensor` is for illustration only).

    Note how the first coordinate of global_offset is always 0, corresponding to tensor dim 0 being replicated.

    Rank        local_tensor        local_shape     global_offset
    -------------------------------------------------------------
    0           [[0, 1],            (2, 2)          (0, 0)
                 [10, 11]]

    1           [[2],               (2, 1)          (0, 2)
                 [12]]

    2           [[3],               (2, 1)          (0, 3)
                 [13]]

    3           [[4],               (2, 1)          (0, 4)
                 [14]]

    Args:
        global_shape (ShapeType): The global shape of the DTensor.
        mesh (:class:`DeviceMesh`): The device mesh this DTensor is distributed on.
        placements (Sequence[:class:`Placement`]]): The placements of the DTensor.
        skip_offset (bool): If True, skip computing the global offsets and return an empty
            tuple for global_offset. This can improve performance when only the local shape
            is needed. Defaults to False.

    Return:
        local_shape: the shape of the DTensor's _local_tensor on the current rank.
        global_offset: a tuple of offsets for each dimension of the global tensor shape,
        identifying how this shard fits into the global tensor in each dimension. If
        skip_offset is True, this will be an empty tuple.

    )&_compute_local_shape_and_global_offsetshapeget_coordinate)rA   rB   rC   rD   s       r   %compute_local_shape_and_global_offsetrJ   Q   s*    ` 2djj$"5"5"7[ r    curr_local_sizemesh_dim_sizerank	placementzero_global_offsetc                    | ||d}t        |t              rd|d<    |j                  di |\  }}	|r|d fS |dk(  r|t        j                  ||dz         fS t        |t
              r<t        |t              s,t        |	t              sJ t        j                  |	|	|z         }
n't        |	t              sJ t        j                  |	      }
|||
fS |||
   fS )N)rK   
num_chunksrM   Freturn_first_offsetr      r@   )	
isinstancer   _local_shard_size_and_offsettorcharanger   intlisttensor)rK   rL   rM   rN   previous_offsetsrO   rD   kwargs
shard_sizeshard_offsetsindexs              r   _get_shard_size_and_offsetsr`      s     +#F
 )]+(-$% F	 F F P PJ4Q5<<(:<NQR<RSSS)U#Jy-,P----]MJ,FG-...]+5  +E222r    offsetsc                     t        | d         S )Nr   )rX   )ra   s    r   _get_first_offsetrc      s    wqz?r    
mesh_shapemy_coordinatec           
         d}|d|fS t        |       }i }t        |      D ]  \  }}	t        |	t        t        f      s|	j
                  }
| |
   }|
t        |      k  sJ d|
 dt        |              |j                  |
      }t        ||
   ||   ||   |	|||      \  }}|||
<   |||
<    |rt        |      |fS dgt        |       z  }|j                         D ]  \  }
}t        |      ||
<    t        |      t        |      fS )a>  
    Suppose you have a full tensor with size global_shape, and you have sharded
    it according to placements for mesh_shape.  This function returns, for a
    specific coordinate my_coordinate in the device mesh:

        - The size of your local shard WITHOUT padding (i.e., if you have
          an uneven split, your size might be smaller than the other entries
          in your dim), and

        - Where the data for your shard begins, in the full tensor.

    This function is fairly simple if your tensor is evenly sharded; the complication
    is around uneven splits.  There is also some complication for handling StridedShard,
    which changes the order you should apply sharding.

    Args:
        global_shape (ShapeType): The global shape of the tensor.
        mesh_shape (ShapeType): The shape of the device mesh.
        my_coordinate (Optional[list[int]]): The coordinate of the current rank in the device mesh.
        placements (Sequence[Placement]): The placements of the DTensor.
        skip_offset (bool): If True, skip computing the global offsets and return an empty
            tuple for global_offset. This can improve performance when only the local shape
            is needed. Defaults to False.

    Returns:
        tuple: A tuple containing:
            - local_shape (tuple[int, ...]): The shape of the local shard on the current rank.
            - global_offset (tuple[int, ...]): The offsets for each dimension identifying where
              this shard begins in the global tensor. If skip_offset is True, this will be an
              empty tuple.
    r@   )r   Sharding dim  greater than tensor ndim r   )rY   	enumeraterT   r   r   dimlengetr`   tupleitemsrc   )rA   rd   re   rC   rD   empty_offsetlocal_shapeshard_dim_to_global_offsetsmesh_dimrN   	shard_dimrO   r[   r]   r^   global_offsetglobal_offsetss                    r   rG   rG      sc   N Ll##|$K #%(4 ?))e]%;<MM	))43{++ 	
I;&@[AQ@RS	
+ 7::9E$?	"x (#%
!
M ",I1>#I.'?( [!<//C#l++M%@%F%F%H E!	>#4^#Di Eu]333r    global_tensorc           	         t        | j                               }t        | j                               }t        |      D ]  \  }}|j                  |      }|j	                         rt        t        |      }|j                  dk  rt        d|       |j                  }	|	t        |      k  sJ d|	 dt        |       d| d       ||	   }
|
|z  dk(  sJ d|
 d|        |
|z  ||	<   t        t        |            D ]"  }||	k7  s	||   ||	   |z  k\  s||   |z  ||<   $ t        |t        t        f      rt        d	t        |       d
       ||fS )ak  
    Compute the local size and stride of a DTensor from the given global tensor info.

    For example, if we have a global tensor with size (4, 8, 4) and stride (32, 1, 8).
    If the DTensor placements are [Shard(2)] and world_size is 2;
    then the local size is (4, 8, 2) and stride is (16, 1, 8).

    Args:
        tensor (:class:`torch.Tensor`):
            Global tensor which DTensor will distribute
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Returns:
        local_shape: A List of int which specifies the size of the local tensor.
        local_stride: A List of int which specifies the stride of the local tensor.
    r   zOShard placements should have negative dims normalized in the user-facing APIs: rg   rh   z for placement number .zGlobal dim z not divisible by mesh size zplacement type z not supported!)rY   sizestrideri   is_shardr   r   rj   AssertionErrorrk   rangerT   r   r   r   type)rv   rB   rC   rp   local_strideidxrN   rL   shard_placementrs   global_dim_sizeis               r   compute_local_tensor_infor     s   4 }))+,K,,./L#J/ SY		#"5)4O""Q&$--<,=?  (++Is;// 	{*DSEUDV W((+uA//
 *)4O"]2a7 o..J=/Z7 &5%EK	" 3|,- GN$Q<	+B]+RR&21o&FLOG I	7';<i0AQRR=S@ $$r    rH   c                     t              dk7  rt        d      t              j                  k7  r%t        dt               dj                   d      t	        d   t
              r S t	        d   t              rt        fd       } |       }t        j                               D cg c]#  }t        j                  ||j                        % }}t        j                  ||       t         fd	       } |||      }t               }	||	d   j                   <   t        j"                  |	      S t        d
t%        d          d      c c}w )a  
    Compute the global size of a DTensor from the given local tensor shape,
    the mesh and placements. Different from `compute_global_tensor_info`,
    which assumes sharding is even, this util allgathers local shards' shapes
    from all ranks and thus can support uneven sharding.
    NOTE: Currently this function only supports 1D mesh.

    Args:
        shape (:class:`torch.Size`):
            Shape of the local tensor
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Return:
        tensor_shape: Shape of the global DTensor.
    rS   z>compute_global_tensor_shape only supports 1 placement for now.z/Expected one placement per mesh dim, but found z placements and z mesh dims.r   c                 X    t        j                  t        |       j                        S )Ndevice)rV   rZ   rY   device_type)rH   rB   s    r   _create_local_shape_tensorz?compute_global_tensor_shape.<locals>._create_local_shape_tensorm  s    <<UD4D4DEEr    r   c                    d}d   j                   }t        t        	            D cg c]
  }||k7  s	| }}|D ]A  }t        j                  | |   ||         st        d      |j                         }|||   z  }C |S c c}w )Nr   z?Non-sharded dimensions should have identical size across ranks.)rj   r}   rk   rV   equalr   tolist)
rp   gathered_shaped_tensorssharded_dim_sumrs   d
other_dimsshape_tensorshape_tensor_listrC   rH   s
           r   "_validate_and_compute_global_shapezGcompute_global_tensor_shape.<locals>._validate_and_compute_global_shapex  s    O"1))I%*3u:%6I!y.!IJI 7 @{{;z#:L<TU&Y  %1$7$7$9!#4Y#??@ #" Js
   
BBzPlacement type z not supported.)rk   NotImplementedErrorndimr   rT   r   r   r   r}   ry   rV   
empty_liker   funcolall_gather_inplacerY   rj   Sizer~   )
rH   rB   rC   r   rp   _r   r   r   rA   s
   ```       r   compute_global_tensor_shaper   G  sr   . :!!L
 	
 :$))#Z))9$))KQ
 	

 *Q-+	JqM5	)	#	F 
$	F 17 499;'#
 [1C1CD#
 #
 	!!"9;M	#	# 
$	# =0
 E{*9Z]&&'zz,''!d:a=12/B
 	
7#
s   3(Eop_callargsc                 >   |D ]  }t        |t        j                  t        f      r|j                  c S t        |t
        t        f      sHt        |      dkD  sWt        |d   t        j                  t        f      s{|d   j                  c S  t        d|  d      )z
    Find the device mesh object from args.
    It returns None if no mesh is found.
    NOTE: we can optimize this search if needed
    r   z+Cannot find device mesh from args for op : rx   )	rT   dtensorDTensorr   device_meshrY   rm   rk   
ValueError)r   r   args      r   try_find_mesh_from_argsr     s      &cGOO[9:??"sT5M*C13q6GOO[#ABq6%%%& B7)1M
NNr    global_stridec                 p    dgt               z  t        |      D ]q  \  }}|j                         st        t        |      j
                  }t        t                     D ]*  } |    |   kD  s|xx   |j                  |      z  cc<   , s t         fdt        t                     D              S )z
    Compute the stride of a local tensor shard, given the global stride of the DTensor.
    NOTE: Currently this function is assuming the DTensor is evenly shardable.
    rS   c              3   4   K   | ]  }|   |   z    y wr2   r@   ).0r   r   stride_divisorss     r   	<genexpr>z'compute_local_stride.<locals>.<genexpr>  s%      34aOA..s   )	rk   ri   r{   r   r   rj   r}   ry   rm   )r   rB   rC   mesh_idxpr   jr   s   `      @r   compute_local_strider     s     cC..O , >!::<UA""A 3}-. > #mA&66#A&$))H*==&>>  8=c->P8Q  r    c                    t        | t        j                        r| S t        | t              r| g}n;t	        |       dk(  r"t        | d   t
              rt        | d         }nt        |       }t        j                  |      S )z
    Unify variable types of size argument to torch.Size
    Acceptable types include:
        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
        or torch.Size
    rS   r   )rT   rV   r   rX   rk   r   rY   )ry   
torch_sizes     r   normalize_to_torch_sizer     se     $

#$V
	TaJtAw9$q']
$Z
::j!!r    )F):loggingr;   collections.abcr   typingr   r   r   rV   )torch.distributed._functional_collectivesdistributed_functional_collectivesr   torch.distributed.tensor._apirZ   _apir   torch._prims_commonr   torch.distributed._local_tensorr   torch.distributed.device_meshr	   *torch.distributed.tensor._collective_utilsr
   &torch.distributed.tensor._dtensor_specr   (torch.distributed.tensor.placement_typesr   r   r   r   r   	getLoggerr7   r(   r   r=   rm   rX   rJ   Tensorr`   rc   rY   rG   _C#_DTensor_compute_global_tensor_infocompute_global_tensor_infor   r   r   _ops
OpOverloadobjectr   r   r   r@   r    r   <module>r      s     $ & &  : : / / ) F 4 H >  
		8	$4B 4Bv 	22
2 #2 	2
 5c?E#s(O+,2j 333 3 }$	3 3 3 3&&'3 3B u||    S4S4S4 9t#S4 #	S4
 S4 5c?E#s(O+,S4l #XXII =%<<=%
=% #=% 49d3i 	=%@H
::H
'H
5=i5HH

ZZH
VOZZ""O*26*:OO*$.<DY<O
38_,"UZZ "r    