import logging
from collections import abc, defaultdict
from collections.abc import Iterable
from typing import Any, Optional, overload, Union

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup


logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        "mtia",
        "xpu",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
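
    A minimal usage sketch (illustrative only; assumes a CUDA device is
    available)::

        scale = torch.full((1,), 65536.0, device="cuda")
        replicator = _GeneralMultiDeviceReplicator(scale)
        cpu_scale = replicator.get(torch.device("cpu"))  # lazily copied and cached per device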
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        if not _is_supported_device(master_tensor):
            raise AssertionError(
                f"Expected supported device, got {master_tensor.device}"
            )
        self.master = master_tensor
        self._per_device_tensors: dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard-aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns
    * Syncs inf/nan for scaled gradient tensors on any torch.device (where tensors are placed) across
    nodes

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.
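
    Example (gradient clipping requires unscaled gradients; a sketch following
    the standard :class:`GradScaler` recipe)::

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()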

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
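
    For example, a non-default configuration (values are illustrative only)::

        scaler = ShardedGradScaler(init_scale=2.**10, growth_interval=1000)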
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor: ...

    @overload
    def scale(self, outputs: list[torch.Tensor]) -> list[torch.Tensor]: ...

    @overload
    def scale(self, outputs: tuple[torch.Tensor, ...]) -> tuple[torch.Tensor, ...]: ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]: ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            if not _is_supported_device(outputs):
                raise AssertionError(f"Expected supported device, got {outputs.device}")
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            if self._scale is None:
                raise AssertionError("Expected _scale to be initialized, got None")
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Keep the return dtype equal to the outputs dtype: with FSDP mixed
            # precision the loss may be fp16/bf16, and the scaled loss should match.
            return scaled_output.type(outputs.dtype)

        stash: list[_GeneralMultiDeviceReplicator] = []

        def apply_scale(
            val: Union[torch.Tensor, Iterable[torch.Tensor]],
        ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
            if isinstance(val, torch.Tensor):
                if not _is_supported_device(val):
                    raise AssertionError(f"Expected supported device, got {val.device}")
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    if self._scale is None:
                        raise AssertionError(
                            "Expected _scale to be initialized, got None"
                        )
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by
        # device and dtype while iterating through them only once.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # Coalescing sparse scaled fp16 grads can overflow, so
                        # coalesce in fp32 before unscaling the values.
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    torch._amp_foreach_non_finite_check_and_unscale_(
                        grads,
                        per_device_found_inf.get(device),
                        per_device_inv_scale.get(device),
                    )

        # Some ranks may hold no (non-zero sized) parameter shards; the found_inf
        # tensor for the scale's device must still be initialized for them.
        if not per_device_found_inf._per_device_tensors:
            if self._scale is None:
                raise AssertionError("Expected _scale to be initialized, got None")
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so we carry
        # out the reciprocal in FP64.
        if self._scale is None:
            raise AssertionError("Expected _scale to be initialized, got None")
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )
        for work in works:
            work.wait()
        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        For example, with the defaults ``backoff_factor=0.5``, ``growth_factor=2.0``,
        and ``growth_interval=2000``, a detected inf/NaN halves the scale, while 2000
        consecutive clean iterations double it.
        """
        if self._scale is None or self._growth_tracker is None:
            raise AssertionError(
                "Expected _scale and _growth_tracker to be initialized, got None"
            )

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
        """
        Updates the scale factor.
        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.
        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)
        Args:
            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
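
        Example::

            scaler.update()                  # derive the new scale from recorded inf checks
            scaler.update(new_scale=2.**10)  # or set the scale explicitly (value illustrative)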
        Nupdatezpnew_scale should be a float or a 1-element torch.cuda.FloatTensor or torch.FloatTensor with requires_grad=False.r   Fr   TrL   r   z,No inf checks were recorded prior to update.r   )r?   r   rP   r   rR   r   r   r    r   r*   numelrequires_gradcopy_r@   rr   r_   rQ   ranger   r!   _amp_update_scale_r   r   r   r   r   r   )
r-   r   rR   r   reasonstaterc   
found_infsfound_inf_combinedis
             r   r   zShardedGradScaler.update5  s   " }}"&"B"B8"L )U+!!),B  ##((DLL8(00??$)(00**%7(00!!), "77>>@!&'=!>!E!E!G  FMMEEJ  :!#$%STT!+A:"q#j/2 8A&*Q-7&8 }}!!U*++,>?((KK((&''(()) &11M%N"7s   2<H)TrD   )r/   r0   r1   r2   r   rw   WORLDstrr   intboolr   r   r.   r   r!   r3   rF   rX   rY   r   r	   optim	Optimizerdictr   r}   r   r   r   __classcell__)rA   s   @r   r5   r5   .   s=   .d # #"#04

0@0@SS S 	S
 S S S  -S 
S, ?U\\?ell? ?KT%,,/KD4FK KWU5<<#45W%c@Q:RW WSXell3S8NS S/$U\\8ELL+AAB/$	u||Xell33	4/$l  78;;((78 <<78 <<	78
 78 
ellELL(	)78r3J%++"7"7 3JD 3Jj2 2 2*DOuell/B)C D DOPT DOr   r5   ) loggingcollectionsr   r   collections.abcr   typingr   r   r   r	   r!   torch.distributeddistributedr   torch.amp.grad_scalerr
   r   r   "torch.distributed.distributed_c10dr   	getLoggerr/   loggerr   r   r   r3   r   r$   r&   r5   r   r   r   <module>r      s     ( $ 1 1    N N ; 
		8	$Ad38n A $ H$: HKO
 KOr   