
    ti=              	       (   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
mZ d dl mZ d dlmZ d dlmZmZ d dlmZ d dlmZmZmZmZmZ d d	lmZ d dlZd dlmZ d dlm Z  d dl!m c m"Z# d d
l$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z, d dl-m.Z. d dl/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z;m<Z<m=Z= d dl>m?Z?m@Z@mAZA d dlBmCZCmDZDmEZEmFZF d dlmGZGmHZH d dlImJZK d dlLmMZMmNZNmOZOmPZP d dlQmRZRmSZSmTZTmUZUmVZVmWZW d dlXmYZY dZZeUrdZ[dZ\ej                  j                         ZZn.eVrdZ[dZ\n'eWrdZ[d Z\ej                  j                         ZZnd!Z[d"Z\d#ZZ G d$ d%e      Z` G d& d'e      Za G d( d)e j                  e      Zcd*e j                  d+ej                  d,efd-Zed. Zf	 	 dzd*e j                  d/egfd0Zhd{d1Zid2 Zjd3 Zkd|d*e j                  d4egfd5Zld*e j                  d6egfd7Zmd*e j                  d8egfd9Zn G d: d;      Zo G d< d=ec      Zp G d> d?ec      Zq G d@ dAeq      Zr G dB dCeq      Zs G dD dEec      Zt G dF dGet      Zu G dH dIe j                        Zv G dJ dKeq      Zw G dL dMe j                        Zx G dN dOe j                        Zz G dP dQe j                        Z{e j                  dRefdS       Z}e j                  dTefdU       Z~e j                  dVefdW       Ze j                  dXefdY       Ze j                  dZefd[       Zee j                  d\efd]              Zee j                  d^efd_              Zee j                  d`efda              Zee j                  dbefdc              Zdded,edeedfefdgZ	 d}dhe j                  die j                  djeedkf   fdlZ ej                  eWdm       G dn doeN             Z G dp dqeM      Zd~dree   fdsZ G dt due j                        Z G dv dwe j                        Z G dx dye j                        Zy)    N)ABCabstractmethod)Callable)nullcontext)deepcopy)autoEnumwraps)Anycastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_msset_rng_seed	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudancclzhpu:0hcclxpuxcclcpugloo   c                   (    e Zd Z e       Z e       Zy)FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE     i/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/testing/_internal/common_fsdp.pyrA   rA   Y   s    fGIrH   rA   c                   6    e Zd Z e       Z e       Z e       Zy)DEVICEInitModeN)rB   rC   rD   r   DEVICE_BEFOREDEVICE_AFTERDEVICE_NEVERrG   rH   rI   rK   rK   b   s    FM6L6LrH   rK   c                       e Zd ZdZedeej                  df   fd       Zedej                  fd       Z	edd       Z
eeded	edej                  fd
              Zy)FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                      y)z+Returns an input for the model as as tuple.NrG   selfdevices     rI   	get_inputzFSDPTestModel.get_inputo        	rH   c                      y)z,Returns the loss given the input and output.NrG   )rT   inputoutputs      rI   get_losszFSDPTestModel.get_losst   rW   rH   Nc                      y)z<Runs the backward pass (e.g. including ``loss.backward()``).NrG   rT   losss     rI   run_backwardzFSDPTestModel.run_backwardy   rW   rH   argskwargsc                       y)z&Initializes an instance of this model.NrG   )r`   ra   s     rI   initzFSDPTestModel.init~   s     	rH   rQ   N)rB   rC   rD   __doc__r   tupletorchTensorrV   r[   r_   staticmethodr   nnModulerc   rG   rH   rI   rP   rP   k   s     5s):#;        C 3 299   rH   rP   modelprocess_group	assert_fnc                 0   | j                         D cg c]%  \  }}||j                         j                         f' }}}|| j                         D cg c]%  \  }}||j                         j                         f' c}}z  }t	        j
                  |      }t        |      D 	cg c]  }	d }
}	t	        j                  |
||       |
d   }|J |
dd D ]+  }|J t        ||d      D ]  \  \  }	}\  }	} |||        - yc c}}w c c}}w c c}	w )a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    Ngroupr   r?   Tstrict)	named_parametersdetachr=   named_buffersdistget_world_sizerangeall_gather_objectzip)rl   rm   rn   
param_nameparamnamed_module_statesbuffer_namebuffer
world_size_olistrank0_statesstatep1p2s                  rI   _assert_module_statesr      s8    "'!7!7!9J 
U\\^'')*  #(#6#6#8K 
fmmo))+,  $$]3J ,-aT-E-5"5]K8L###qr     #L% E 	GQWab"	
 .s   *D*D'	Dc                  4    t        j                  t              S N)rg   rU   DEVICE_TYPErG   rH   rI   get_devtyper      s    <<$$rH   zero_buffersc                    |rt        j                  |       n	t               }|5  | j                         D ]/  }t	        j
                         5  |j                          ddd       1 |rB| j                         D ]/  }t	        j
                         5  |j                          ddd       1 ddd       y# 1 sw Y   xY w# 1 sw Y   PxY w# 1 sw Y   yxY w)zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersrg   no_gradzero_buffers)rl   r   summon_fullctxr}   r   s         rI   _zero_modelr      s     -8$
!
!%
([]C	 #%%' 	E  	 --/ #]]_ #LLN# ### # # ## #s;   (CB43CC !
C4B=9C C	CCc                 t    |s| j                  t              } |r| j                          | j                         S r   )tor   half
state_dict)rl   cpu_offloadr   s      rI   _get_state_dictr      s.    %

rH   c           	      j    dj                  |D cg c]  }|| t        |         nd c}      S c c}w )Nr   none)joinstr)test_name_mappingr`   ss      rI   subtest_namer      s7    88IMNAam	3q6	"	?N Ns   0c                 x   |j                         D ];  \  }}|j                  t        j                  d      k7  s)|j                         ||<   = | dk(  r|nd g}t	        j
                  |       t        t        t        t        j                  f   |d         }|D ]  }||   j                  t              ||<    |S )Nr=   r   )itemsrU   rg   r=   rw   broadcast_object_listr   dictr   rh   r   r   )rankr   r|   r}   r   s        rI   _broadcast_state_dictr      s     (--/ 1
E<<5<<..%*YY[Jz"1  19Z$/Eu%d3,-uQx8J  H
!+J!7!:!:;!G
:HrH   recursec                     t        j                  | |      5  t        t        | j	                                     cddd       S # 1 sw Y   yxY w)a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r   r   r   listr   )rl   r   s     rI   get_full_paramsr      s?     
	 	 	8 2U--/012 2 2s   "AAmove_to_devicec                 4    |r| j                  t              S | S r   )r   r   )rl   r   s     rI   _move_to_devicer      s    $2588K ==rH   	wrap_fsdpc                 (    |s| S t        | g|i |S r   r   )rl   r   r`   ra   s       rI   _maybe_wrap_fsdpr      s    !5CtE'CD'CF'CCrH   c                   :    e Zd ZdedefdZdefdZdefdZd Zy)	DummyProcessGroupr   sizec                      || _         || _        y r   )_rank_size)rT   r   r   s      rI   __init__zDummyProcessGroup.__init__   s    

rH   rQ   c                     | j                   S r   )r   rT   s    rI   r   zDummyProcessGroup.rank       zzrH   c                     | j                   S r   )r   r   s    rI   r   zDummyProcessGroup.size   r   rH   c                 B    t        j                         }d }||_        |S )Nc                  d    t         j                  j                         } | j                  d       | S )Nr?   )rg   futuresFuture
set_result)futures    rI   
get_futurez/DummyProcessGroup.allreduce.<locals>.get_future   s'    +0==+?+?+AFa MrH   )r   Mockr   )rT   r`   ra   	dist_waitr   s        rI   	allreducezDummyProcessGroup.allreduce   s"    IIK		
  *	rH   N)rB   rC   rD   intr   r   r   r   rG   rH   rI   r   r      s2    S  c c 	rH   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 	 ddej                  d
ededeeeef      dededeej(                  ef   fd       Zd Z xZS )TransformerWithSharedParamsrq   device_init_modeadd_bndeterministicc                    t         |           |j                         | _        |j                         | _        |rt        j                  d       d}d}t        j                  ||      | _	        t        j                  |dddd      | _        t        j                  ||      | _        | j                  j                  | j                  _        | j                  d| j                  j                  j!                  |f             | j                  d	t        j"                  | j$                  t
        j&                  
             d| _        |r)t
        j                  j+                  | j(                        nt
        j                  j-                         | _        |t0        j2                  k(  r| j5                  t6              } |r| j9                          y y )Nr               g?)d_modelnum_encoder_layersnum_decoder_layersdim_feedforwarddropout
vocab_biaslong_buffer)dtype)superr   r   r   r   rg   manual_seedrj   	Embeddingembed_tokensTransformertransformerLinearoutput_projweightregister_buffernew_ones
zeros_liker   longbsBatchNorm1dIdentitybnrK   rL   r   r   eval)rT   rq   r   r   r   d_vocabr   	__class__s          rI   r   z$TransformerWithSharedParams.__init__  sa    	JJL	**,a LL':>>  
 99Wg6 #'"3"3":":$++22;;WJG	
 	T__EJJ?	

 39%((&&tww/uxx?P?P?R~;;;77;'DIIK rH   c                 ,   t        j                  d| j                  z          t        j                  d|      j	                  d| j
                        }t        j                  | j
                  dz  |      j	                  d| j
                        }||fS )Nr?      rU      r7   )rg   r   r   arangeviewr   )rT   rU   srctgts       rI   rV   z%TransformerWithSharedParams.get_input1  sl    !dii-(ll2f-221dgg>ll477Q;v6;;AtwwGSzrH   c                    | j                  |      }|| j                  z   | j                  j                  |      z   }| j                  |      }| j	                  |      }| j                  ||      }| j                  |      S r   )r   r   r   type_asr   r   r   )rT   src_idstgt_idsr   r   xs         rI   forwardz#TransformerWithSharedParams.forward7  sv    (DOO#d&6&6&>&>s&CC(ggclS#&""rH   c                     |\  }}t         j                  j                  |j                  d|j	                  d            |j                  d      d      S )Nsum)	reduction)rj   
functionalcross_entropyr   r   )rT   rY   rZ   r   r   s        rI   r[   z$TransformerWithSharedParams.get_loss?  sI    3}}**KKFKKO,chhrle + 
 	
rH   c                 $    |j                          y r   backwardr]   s     rI   r_   z(TransformerWithSharedParams.run_backwardE      rH   fsdp_init_modefsdp_kwargsrQ   c                 N   |i }|t         j                  k(  r&t        | t              r| d   }n| }t	        ||||      S |t         j
                  k(  rd|vrt        t        t        h      }n|j                  d      }d|v r8|d   t        j                  t        j                  hv rt        | t              sd}n| }t        | t              r| d   }	n| }	t	        |	|||      }
t        |
|fd|i|}|t        j                  k(  r|j!                  t"              }|S t%        d|       )au  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )rA   rE   
isinstancerf   r   rF   r    r*   r)   popr   HYBRID_SHARD_HYBRID_SHARD_ZERO2r   rK   rM   r   r   
ValueError)rq   r
  r   r  r   r   pgr  fsdp_pg
tformer_pgm
fsdp_models               rI   rc   z TransformerWithSharedParams.initH  sX   6 K\111%'1X.$fm  |555!4#3//$  $/??3E#F  ${2 34$113C3W3WXY"5%0%'"1X
"
+,fmA  "2 	J  >#>#>>']];7
77GHIIrH   c                     | j                   gS r   )r   r   s    rI   get_ignored_modulesz/TransformerWithSharedParams.get_ignored_modules  s      !!rH   )NFT)rB   rC   rD   rw   ProcessGrouprK   boolr   rV   r   r[   r_   ri   rA   r   r   r   r   r   rj   rk   r   rc   r  __classcell__r   s   @rI   r   r     s    (  ( )( 	(
 (T#
 
 15#KJ  KJ$KJ )KJ d38n-	KJ
 KJ KJ 
ryy$	KJ KJZ"rH   r   c                        e Zd Zdej                  dededef fdZd Zd Z	d Z
d	 Ze	 	 ddej                  d
ededeeeef      dedej&                  fd       Z xZS )NestedWrappedModulerq   r   r   r   c                    t         |           j                         | _        j                         | _        |t
        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        t        j                  dd      |            t        t        j                  dd      |            | _        y )Nc                 &    rt        | fi S | S r   r   layerr  rq   r   s    rI   _maybe_wrapz1NestedWrappedModule.__init__.<locals>._maybe_wrap      E58K88LrH   r   r   r7   r   )r   r   r   r   r   rK   rL   rg   r   rj   
Sequentialr   r   module	rT   rq   r   r   r   r  r   r&  r   s	    ``  `  rI   r   zNestedWrappedModule.__init__  s     	JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F 		"a(8.IJBIIaO^<

rH   c                 x    t        j                  d| j                  z          t        j                  dd|      fS )Nr?   r7   r   r   )rg   r   r   randrS   s     rI   rV   zNestedWrappedModule.get_input  s.    !dii-(

1a/11rH   c                 $    | j                  |      S r   r)  rT   r   s     rI   r   zNestedWrappedModule.forward      {{1~rH   c                 &    |j                         }|S r   )r  rT   rY   rZ   r^   s       rI   r[   zNestedWrappedModule.get_loss  s    zz|rH   c                 $    |j                          y r   r  r]   s     rI   r_   z NestedWrappedModule.run_backward  r	  rH   r
  r  rQ   c                    |i }|t         j                  k(  rt        | d||      S |t         j                  k(  r:t        | fd||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        Fr   r   r   Tr  )	rA   rE   r!  rF   rK   rM   r   r   r  )rq   r
  r   r  r   r  s         rI   rc   zNestedWrappedModule.init  s    . K\111&!1+	  |555,!1+	
 J  >#>#>>']];7
77GHIIrH   NF)rB   rC   rD   rw   r  r  rK   r   rV   r   r[   r_   ri   rA   r   r   r   r   rj   rk   rc   r  r  s   @rI   r!  r!    s    
  
 
 )	

 
@2 
 15#+J  +J$+J )+J d38n-	+J
 +J 
+J +JrH   r!  c                   h     e Zd Ze	 	 ddej
                  dededee	e
ef      def
 fd       Z xZS )AlwaysWrapNestedWrappedModulerq   r
  r   r  r   c                 0   t         t        t          	 | t        j                  |||      }|t        j                  k(  r|S |t        j
                  k(  rB|xs i }t        |fdt        i|}|t        j                  k(  r|j                  t              }|S y)z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )rq   r
  r   r  r   r  N)r   r8  rc   rA   rE   rF   r   r   rK   rM   r   r   )rq   r
  r   r  r   rl   r  r   s          rI   rc   z"AlwaysWrapNestedWrappedModule.init  s     )+H
'//-#'  
 	 \111L|555%+KeX6HXKXJ>#>#>>']];7
 6rH   r6  )rB   rC   rD   ri   rw   r  rA   rK   r   r   r   r   r  rc   r  r  s   @rI   r8  r8    s^    
 15#  $ ) d38n-	
  rH   r8  c                        e Zd Zdej                  dededef fdZed
d       Z	e	 	 ddej                  de
dedeeeef      def
d	       Z xZS )NonUniformReqGradNWMrq   r   r   r   c                    t         t        |           j                         | _        j	                         | _        |t        j                  k(  }fd}|rt        j                  d       t        j                  t        t        j                  dd      |       |t        j                   |t        t        j                  dd      |            t        t        j                  dd      |                   |t        j                  t        t        j                  dd      |      t        t        j                  dd      |                        | _        y )Nc                 &    rt        | fi S | S r   r   r$  s    rI   r&  z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrap.  r'  rH   r   r   r7   r   )r   r!  r   r   r   r   rK   rL   rg   r   rj   r(  r   r   r)  r*  s	    ``  `  rI   r   zNonUniformReqGradNWM.__init__  s     	!413 JJL	**,)^-I-II	
 a mmBIIaO^<		!R0@. QR#BIIb"$5~F #BIIb!$4nE#BIIaO^D
rH   c                     | j                         D ]-  \  }}t        j                  ||      r|j                  d       / y r6  )rt   rematchrequires_grad_)rl   req_grad_masknps       rI   _set_nonuniform_req_gradz-NonUniformReqGradNWM._set_nonuniform_req_gradE  s:    **, 	(DAq88M1-  '	(rH   r
  r  c                    t        j                  d      }|t        j                  k(  r't	        | d||      }t        j                  ||       |S |t        j                  k(  rT|i }t	        | fd||d|}|t        j                  k(  r|j                  t              }t        j                  ||       |S t        d|       )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr5  Tr  )r?  compilerA   rE   r;  rE  rF   rK   rM   r   r   r  )rq   r
  r   r  r   req_grad_pattern	ddp_modelr  s           rI   rc   zNonUniformReqGradNWM.initK  s    ( ::&9:\111,!1+	I !99)EUV|555" -!1+	
 J  >#>#>>']];7
 99*FVW77GHIIrH   rd   r6  )rB   rC   rD   rw   r  r  rK   r   ri   rE  rA   r   r   r   r   rc   r  r  s   @rI   r;  r;    s    (
  (
 (
 )	(

 (
T ( (
 
 15#+J  +J$+J )+J d38n-	+J
 +J +JrH   r;  c                        e Zd ZdZdej
                  dedef fdZd Zd Z	d Z
d	 Zed
ee   dedededef
d       Z xZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r)  delay_after_loss_msdelay_before_reduction_msc                 L    t         |           || _        || _        || _        y r   )r   r   rL  rM  r)  )rT   r)  rL  rM  r   s       rI   r   zModuleWithDelay.__init__~  s'     	#6 )B&rH   c                 8    | j                   j                  |      S r   )r)  rV   rS   s     rI   rV   zModuleWithDelay.get_input  s    {{$$V,,rH   c                 $    | j                  |      S r   r.  r/  s     rI   r   zModuleWithDelay.forward  r0  rH   c                 B   | j                   j                  ||      }| j                  dkD  rst        st        r$t        j                  | j                  dz         |S t        r=t        j                  j                  t        | j                  t               z               |S Nr     )r)  r[   rL  r4   r5   timesleepr3   rg   r8   _sleepr   r1   r2  s       rI   r[   zModuleWithDelay.get_loss  sy    {{##E62##a'8

433d:;  

!!#d&>&>ARAT&T"UVrH   c                      t         j                  j                   fd}t        j                  d|      5   j
                  j                  |       d d d        y # 1 sw Y   y xY w)Nc                     j                   dkD  rrt        r>t        j                  j	                  t        j                   t               z               n.t        st        r"t        j                  j                   dz          | i |S rR  )rM  r3   rg   r8   rV  r   r1   r4   r5   rT  rU  )r`   ra   orig_reduce_scatterrT   s     rI   _delayed_reduce_scatterz=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatter  sk    --1JJ%%D::=N=PPQ JJt==DE&777rH   z'torch.distributed.reduce_scatter_tensor)rg   distributedreduce_scatter_tensorr   patchr)  r_   )rT   r^   rZ  rY  s   `  @rI   r_   zModuleWithDelay.run_backward  sW    #//EE	8 ZZ57N
 	+ KK$$T*	+ 	+ 	+s   AA'module_class
model_argsmodel_kwargsc                <    t         | j                  |i |||      S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )rK  rc   )r^  rL  rM  r_  r`  s        rI   rc   zModuleWithDelay.init  s,    * Lz:\:%
 	
rH   )rB   rC   rD   re   rj   rk   r   r   rV   r   r[   r_   ri   typerP   r   rc   r  r  s   @rI   rK  rK  z  s    F				 !	 $'		-+$ 
=)

 !
 $'	

 
 
rH   rK  c                   ~    e Zd Zeej
                  ddddfdej                  dedede	e
eef      ded	ed
efd       Zy)NestedWrappedModuleWithDelayNFr   rq   r
  r   r  r   rL  rM  c           
      D    t         j                  t        | ||||||      S )Nrq   r
  r   r  r   rL  rM  )rK  rc   r!  rf  s          rI   rc   z!NestedWrappedModuleWithDelay.init  s4     ##)-#' 3&? $ 	
 		
rH   )rB   rC   rD   ri   rK   rM   rw   r  rA   r   r   r   r   r  r   rc   rG   rH   rI   rd  rd    s     ,:+F+F04##$)*
  
$
 )
 d38n-	

 
 !
 $'
 
rH   rd  c                   $     e Zd Z fdZd Z xZS )DummyDDPc                 0    t         |           || _        y r   )r   r   r)  )rT   r)  r   s     rI   r   zDummyDDP.__init__  s    rH   c                 &     | j                   |i |S r   r.  rT   r`   ra   s      rI   r   zDummyDDP.forward  s    t{{D+F++rH   rB   rC   rD   r   r   r  r  s   @rI   rh  rh    s    ,rH   rh  c                        e Zd Zdej                  dedededef
 fdZd Z	d Z
e	 	 	 ddej                  d	eded
eeeef      dedefd       Z xZS )MixtureOfExpertsrq   r   r   delay_before_free_msr   c                    t         |   ||||       || _        || _        || _        |t
        j                  k(  | _        |r"t        j                  d| j                  z          d}d}d}	t        t        j                  ||      | j                        }
t        d |
j                         D              | _        |
j                         D ]	  }d|_         |rt        j                  d       t        t        j                  ||      | j                        }|rHt        j$                  j'                  |j                         g      }t)        |
|fi |}
t)        ||fi |}t        j*                  t        t        j                  |	|      | j                        ||
t        t        j                  ||	      | j                              | _        y )	N)rq   r   r   r   *   r   r   r   c              3   <   K   | ]  }|j                           y wr   )numel).0rD  s     rI   	<genexpr>z,MixtureOfExperts.__init__.<locals>.<genexpr>  s     $L1QWWY$L   Tr   )r   r   rq   ro  r   rK   rL   r   rg   r   r   r   rj   r   r  r   num_expert_paramsexpertr[  	new_groupr   r(  r)  )rT   rq   r   r   ro  r   r  d_expertd_sharedd_inputrx  rD  sharedexpert_groupr   s                 rI   r   zMixtureOfExperts.__init__  s    	-'	 	 	
 
$8!"..2N2NNb499n- 8X!>@S@ST!$$L8I8I8K$L!L""$ 	AAH	 a  8X!>@S@ST ,,66L &,>+>F&%7;7FmmBIIgx8$:M:MNBIIh8$:M:MN	
rH   c                 f     j                   dkD  r j                  d   }t        |t              ret        j
                  j                  j                  j                   fd}t        j                  d|      5   j                  |      cd d d        S  j                  |      S # 1 sw Y   xY w)Nr   r   c                      t         r>t        j                  j                  t	        j
                  t               z               n.t        st        r"t        j                  j
                  dz          | i |S )NrS  )r3   rg   r8   rV  r   ro  r1   r4   r5   rT  rU  )r`   ra   orig_reshardrT   s     rI   _delayed_reshardz2MixtureOfExperts.forward.<locals>._delayed_reshard"  s]     

)) 9 9<M<O OP "X

4#<#<t#CD'888rH   z.torch.distributed.fsdp._runtime_utils._reshard)ro  r)  r  r   rg   r[  fsdp_runtime_utils_reshardr   r]  )rT   r   rx  r  r  s   `   @rI   r   zMixtureOfExperts.forward  s    $$q([[^F&$'$0055DDMM9 ZZDFV *  ;;q>* *
 {{1~* *s   ;B''B0c                    |j                          | j                  st        j                         5  | j	                         D ]v  }t        |d      r|j                  |j                  j                  | j                         t        j                  j                  |j                  | j                         x 	 d d d        y y # 1 sw Y   y xY w)Nrx  rp   )r  r   rg   r   r   hasattrgraddiv_r   r[  
all_reducerq   )rT   r^   rD  s      rI   r_   zMixtureOfExperts.run_backward4  s    ~~ O* OAq(+ vv)DOO4))44QVV4::4NOO O O Os   -CACCr
  r  c                    |i }|t         j                  k(  rt        | d|||      S |t         j                  k(  r;t        | fd|||d|}|t        j
                  k(  r|j                  t              }|S t        d|       )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        F)r   r   ro  r   Tr  )	rA   rE   rn  rF   rK   rM   r   r   r  )rq   r
  r   r  r   ro  r  s          rI   rc   zMixtureOfExperts.init@  s    4 K\111#!1%9+  |555)!1%9+ J  >#>#>>']];7
77GHIIrH   )NFr   )rB   rC   rD   rw   r  r  rK   r   r   r   r_   ri   rA   r   r   r   r   rc   r  r  s   @rI   rn  rn    s    2
  2
 2
 )	2

 "2
 2
h0
O 
 15#$%0J  0J$0J )0J d38n-	0J
 0J "0J 0JrH   rn  c                        e Zd Z	 ddddddedeej                     deded	ef
 fd
Zdej                  dej                  fdZ
d Z xZS )MLPTFr7   )biaswith_bufferdim_multiplierdimrU   r  r  r  c                
   t         |           t        j                  |||z  ||      | _        t        j                  ||z  |||      | _        |r)| j                  dt        j                  |f|             y d | _	        y )N)rU   r  r   r   )
r   r   rj   r   in_projout_projr   rg   randnr   )rT   r  rU   r  r  r  r   s         rI   r   zMLP.__init__u  so     	yyns&:6PTU		.3"6FQUV  5;;vf+MNDKrH   r   rQ   c                     | j                  |      }t        j                  |      }| j                  |      }t        j                  |      }| j                  || j                  z   }|S r   )r  Frelur  r   )rT   r   zs      rI   r   zMLP.forward  sS    LLOFF1IMM!FF1I;;"DKKArH   c                     | j                   4t        j                  j                  j	                  | j                          y y r   )r   rg   rj   rc   normal_r   s    rI   reset_parameterszMLP.reset_parameters  s+    ;;"HHMM!!$++. #rH   r   )rB   rC   rD   r   r   rg   rU   r  r   rh   r   r  r  r  s   @rI   r  r  t  sv     *.
 ! &
   " %,, /rH   r  c                   F     e Zd Zdddedef fdZdededed	d fd
Z xZS )MLPStackF)with_seq_parallelmlp_dimr  c                    t        |d      t        |      t        |d      g}|r&|j                  t        j                  |d             t	        |   |  || _        y )N   )r  Fr  )r  appendrj   	LayerNormr   r   r  )rT   r  r  modulesr   s       rI   r   zMLPStack.__init__  sX     *L*	$
 NN2<<e<='"!2rH   tp_meshdp_meshuse_activation_checkpointingrQ   c           
         t        d      t        d      t        d      t        d      t        d      | j                  rt        t        d            n	t               d}| j                  rt	        d      |d<   t        | ||       | D ]8  }t        |t        j                        r|rt        |       t        |fd	|i| : t        | fd	|i| | S )
NF)use_local_outputr?   )output_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)sequence_dim3)device_meshparallelize_planmesh)r%   r'   r  r$   r(   r&   r  rj   r  r   r   )rT   r  r  r  r  r  r)  s          rI   parallelizezMLPStack.parallelize  s     )%@)5A(%@)5A(%@%% *qB "
 !!$4!$DS!4WGWX 	=F&",,/+6"<W<<	= 	D6w6+6rH   )	rB   rC   rD   r   r  r   r   r  r  r  s   @rI   r  r    sD    BG 
3 
34 
3  '+	 
rH   r  c                        e Zd ZdZddedef fdZdej                  de	e
ej                  ej                  f   ej                  f   fdZ xZS )	DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    r  use_second_linearc                     t         |           t        j                  ||      | _        t        j                  ||      | _        t        j                         | _        || _        y r   )	r   r   rj   r   lin1lin2ReLUr  r  )rT   r  r  r   s      rI   r   zDoubleLinear.__init__  sG    IIc3'	IIc3'	GGI	!2rH   r   rQ   c                     | j                   r@| j                  | j                  |            | j                  | j                  |            fS | j                  | j                  |            S r   )r  r  r  r  r/  s     rI   r   zDoubleLinear.forward  sQ     !!99TYYq\*DIIdiil,CCCyy1&&rH   T)rB   rC   rD   re   r   r  r   rg   rh   r   rf   r   r  r  s   @rI   r  r    sT    
3C 3D 3''	uU\\5<</0%,,>	?'rH   r  new_all_gather_into_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rw   all_gather_into_tensorbarrier)r  orig_all_gathers     rI   patch_all_gatherr    sO     11OLLN"<D6&5# 	&5#   0A;A  A;!A88A;new_foreach_all_gatherc              #   :  K   t         j                  j                  j                  j                  j
                  }t        j                          | t         j                  j                  j                  j                  _        	 d  t        j                          |t         j                  j                  j                  j                  _        y # t        j                          |t         j                  j                  j                  j                  _        w xY wwr   )rg   r[  r  _fully_shard_fsdp_param_groupforeach_all_gatherrw   r  )r  orig_foreach_all_gathers     rI   patch_foreach_all_gatherr    s      	++==PP  	LLN 
''99L
# 	++==P 	# 	++==P    B DC ADA	DDnew_foreach_reducec              #   :  K   t         j                  j                  j                  j                  j
                  }t        j                          | t         j                  j                  j                  j                  _        	 d  t        j                          |t         j                  j                  j                  j                  _        y # t        j                          |t         j                  j                  j                  j                  _        w xY wwr   )rg   r[  r  r  r  foreach_reducerw   r  )r  orig_foreach_foreach_reduces     rI   patch_foreach_reducer    s      	++==LL   	LLN 
''99H
' 	++==L 	' 	++==Lr  new_reduce_scatter_tensorc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rw   r\  r  )r  rY  s     rI   patch_reduce_scatterr    sP     44LLN!:D9%8" 	%8"r  new_all_reducec              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )rw   r  r  )r  orig_all_reduces     rI   patch_all_reducer    sJ     ooOLLN$DO*) 	)r  new_unshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   unshardrw   r  )r  orig_unshards     rI   patch_unshardr  $  Q      "))LLLN(N.!- 	!-r  new_reshardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   reshardrw   r  )r  r  s     rI   patch_reshardr  1  r  r  new_post_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   post_backwardrw   r  )r  orig_post_backwards     rI   patch_post_backwardr  >  sR      (55LLN#4N :'9$ 	'9$r  new_backwardc              #      K   t         j                  }t        j                          | t         _        	 d  t        j                          |t         _        y # t        j                          |t         _        w xY wwr   )r   r  rw   r  )r  orig_backwards     rI   *patch_register_post_backward_hook_backwardr  K  sT      199MLLN,8 )>0=$- 	0=$-r  rY  r`   ra   c                     t        |      dkD  r|d   }nd|v r|d   }nt        d| d|        ||        ||i |S )Nr   rZ   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsrY  rn   r`   ra   rZ   s         rI   reduce_scatter_with_assertr  X  sa     4y1}a	V	!;D6F8T
 	
 f///rH   replicated_modulesharded_moduleprefixes_to_ignore.c                    t        |j                         |j                         d      D ]  \  \  }}\  }}|}|D ]  }	|j                  |	d      } | j                  ||       | j	                  |t
               t        |t
              sJ |j                  |j                  }}
t        |      t        d      t        d      fk(  rt        d      t        ||
|      }| j                  |j                         |j                                |j                  | j                  |j                         | j!                  |j                         t        |j                  |
|      }| j	                  |j                  t
               t        |j                  t
              sJ | j                  |j                  j                         |j                                 y )NTrr    r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)r{   rt   replaceassertEqualassertIsInstancer#   r  r  
placementsrf   r$   r  r"   to_localr  assertIsNoneassertIsNotNone)r  r  r  r  replicated_namereplicated_paramsharded_namesharded_paramclean_sharded_nameprefixr  r  sharded_ref_paramsharded_ref_grads                 rI   check_sharded_parityr  k  s    OR**,'')O TJ+*-JlM
 *( 	HF!3!;!;FB!G	H);<]G4-111(44m6N6Njq58 44 ;  ..>jQ..02C2L2L2NO  (]//0M../,-=-B-BD*U]//9-,,g666**3357G7P7P7RS5TrH   znot-support-multithreadc                   @     e Zd Zed        Z fdZd Zd Zd Z xZ	S )FSDPTestMultiThreadc                     t         S r   DEVICE_COUNTr   s    rI   r   zFSDPTestMultiThread.world_size      rH   c                 B    t         |           | j                          y r   )r   setUp_spawn_threadsrT   r   s    rI   r  zFSDPTestMultiThread.setUp  s    rH   c                      t        | g|i |S r   r.   rk  s      rI   r.   z FSDPTestMultiThread.run_subtests      D242622rH   c                 @    t         j                  j                          y r   rg   _dynamoresetr   s    rI   perThreadSetUpz"FSDPTestMultiThread.perThreadSetUp      rH   c                 @    t         j                  j                          y r   r  r   s    rI   perThreadTearDownz%FSDPTestMultiThread.perThreadTearDown  r  rH   )
rB   rC   rD   propertyr   r  r.   r  r  r  r  s   @rI   r  r    s)     3rH   r  c            $           e Zd Z fdZed        Zed        Zedefd       Zed        Z	d Z
d Zd	 Zd
 Zed        Z	 	 	 	 	 	 	 d'dej"                  dedededee   dedee   dededeeeef      fdZddd e       dddddddddfdee   dededee   deded ed!ee    d"ee!   dee   d#ed$ededed%eeeef      deeeef      f d&Z" xZ#S )(FSDPTestc                 h    t         |           dt        j                  d<   | j	                          y )N0TORCH_NCCL_DESYNC_DEBUG)r   r  osenviron_spawn_processesr  s    rI   r  zFSDPTest.setUp  s)     14

,-rH   c                     t         S r   r  r   s    rI   r   zFSDPTest.world_size  r	  rH   c                 >    t         j                  j                         S r   )rw   distributed_c10d_get_default_groupr   s    rI   rm   zFSDPTest.process_group  s    $$7799rH   rQ   c                      yr6  rG   r   s    rI   destroy_pg_upon_exitzFSDPTest.destroy_pg_upon_exit  s     rH   c                 *    t          | j                   S r   )r0   	file_namer   s    rI   init_methodzFSDPTest.init_method  s    t~~.//rH   c                 <    | j                  ||j                         y r   )r  r   )rT   r  r   s      rI   _check_cpu_offloadzFSDPTest._check_cpu_offload  s    j&<&<=rH   c                 <    | j                  ||j                         y r   )r  backward_prefetch)rT   r  r.  s      rI   _check_backward_prefetchz!FSDPTest._check_backward_prefetch  s    *J,H,HIrH   c                 <    | j                  ||j                         y r   )r  forward_prefetch)rT   r  r1  s      rI   _check_forward_prefetchz FSDPTest._check_forward_prefetch  s    ):+F+FGrH   c                      t        | g|i |S r   r  rk  s      rI   r.   zFSDPTest.run_subtests  r  rH   c                     | |      }||_         ||_        |j                  dd      }t        d|j                    d|j                          t
        j                  j                         |j                  k  r3t        j                  t        d|j                      j                         	 |r`t
        j                  j                  j                  j                  j!                         }t#        j$                  d|j                  ||       nDt#        j$                  |j&                  t(        t+        |j                        |j                          d }
|j                   t0        z  }t2        st4        rt
        j                  j7                  |       |g}
t#        j8                  |
       t
        j:                  j=                          t?                |jA                  ||       t
        j:                  j=                          t#        j8                  |
       t#        jB                          y # t,        $ r=}	d	|	j.                  d
   v r&t        j                  t        d   j                          d }	~	ww xY w)Nfake_pgFzdist init r=z, world=z
multi-gpu-fake)backendr   r   store)r*  r7  r   r   	recompiler   backend_unavailable)
device_ids)"r   r)  getprintr   rg   acceleratordevice_countsysexitr/   	exit_codetesting	_internalr[  r5  	FakeStorerw   init_process_groupr*  DISTRIBUTED_BACKENDr   RuntimeErrorr`   r  r3   r5   set_device_indexr  r  r  r2   run_testdestroy_process_group)r  r   	test_namer)  pipera   rT   r5  r8  er;  	device_ids               rI   _runzFSDPTest._run  s   9~	"**Y.TYYKx/@AB))+doo=HHZ*T__,= >?IIJ	//;;CCMMO''"#	 '' $ 0 0/"4??3	 
II,	..y9[

 	
+i&
+""$1  	affQi'$9:DDE		s   -B&H/ /	I588I00I5NFrl   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc           	         |xr |j                   }t        |j                               j                  }|
i }
t	        dd|i|
}t
        j                  j                  |j                         |d      }t        |      D ]  }|j                          t
        j                  j                  t        |      5  |j                  j                  t        j                  t                    }|	s|rMt        |t               s=t        |t
        j"                        r|j%                         }nt'        d |D              } || }|rft        |t               rV|j(                  t*        vrD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j                  j/                  ||      j1                  |      }d d d        |j3                        }|s&|	s$|j4                  t
        j6                  k(  sJ d       |	r+| j-                  |j4                  t
        j8                         net        |t               r+|J | j-                  |j4                  |j:                         n*| j-                  |j4                  t
        j6                         |j                  j=                  |       |rTt        |t               rD|j                         D ]1  }| j-                  |j                  t        j                  d             3 |j?                  |       |jA                          |s|jC                         jE                         D ci c]  \  }}||jG                          }}}tI        |       |jK                  |        t        |t               r|jM                  tN        jP                         jS                         S # 1 sw Y   xY wc c}}w )	Nenabledg?)rS  momentum)r[  c              3   <   K   | ]  }|j                           y wr   )r   )rt  r   s     rI   ru  z4FSDPTest._train_for_several_steps.<locals>.<genexpr>   s     %>1affh%>rv  r=   zeloss data type should be float32, as the original                     parameter data type is float32.rG   )*offload_paramsnextr   rU   r   rg   optimSGDry   	zero_gradamprR  r   r)  rV   r  r   rh   r   rf   r  r   r  r[   r   scaler   float32float16param_dtyper_   stepupdater   r   cloner   load_state_dict_assert_stater   IDLEru   )rT   rl   rQ  rR  rS  rT  rU  rV  rW  rX  rY  cpu_offload_paramsmodel_devicesharded_grad_scalerr`  r   rY   rZ   rD  r^   kvr   s                          rI   _train_for_several_stepsz!FSDPTest._train_for_several_steps  s4    .Q2B2Q2QE,,./66%-)+&/ 
.
2L

  0 0 2rCHy! 9	2AOO##K#B M..u||K/HI _Zt=T!%6 %

 %%>%> > '"5$/ //>? #--/ H((5<<3FGH ||,,UF;>>|L-M. ',,T2D"=zzU]]2 52
 !$$TZZ?t,*666$$TZZ1L1LM$$TZZ?LL%%d+!j&=))+ DA$$QXXu||E/BCD  $$U+&&(7<7G7G7I7O7O7QRtq!alR
R E"%%j1s9	2v eT" 2 23{{}wM Mf Ss   9DOO(O%	r   Tmodel_classr
  r   ref_init_fn	num_itersr   r.  r  r1  use_orig_paramsinit_kwargsc                    |t         j                  k7  sJ d       |i }d}| j                  j                         } |j                  | j                  t         j                  t
        j                  fddi|}|.t        rt        |t        gt              }nt        ||g|      }n ||      }|r|j                         }| j                  |||
du|||
|||	      }t        |j                               }|j                  |||	|
||d       	  |j                  | j                  |||fddi|}t%        |t&              st'        || j                  fi |}|r|j                         }|t
        j(                  k(  r|j+                  t              }|duxr |j,                  }|xr |t
        j(                  k(  }|xr |t
        j(                  k7  }|rFt/        j0                  d      }|j                         D ]  }| j3                  |j0                  |         |r| j5                  t6        dt               n	t9               }|5  | j                  ||d||||
|||
      } ddd       |ry|r[t/        j0                  d      }|j                         D ]  }| j3                  |j0                  |          j+                  t              } t;        |      }!t.        j<                  j?                  | d       |
|s| j3                  ||!dd       yyy# t        $ r }t!        d	| d
t#        |             |d}~ww xY w# 1 sw Y   xY w)a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPN{Gz?r   T)r;  output_device)rR  rS  rT  rV  rW  rX  rY  )r   r.  r  rV  r1  rw  zInitializing z raised error r=   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)rR  rS  rT  rU  rV  rW  rX  rY  )check_dtypezFSDP did not match DDP)exact_devicemsg) rA   rE   rm   r   rc   rK   rL   r4   DDPr   r   rs  r   r   ri  	Exceptionr  r   r  r   rM   r   r^  rg   rU   r  assertRaisesRegexrH  r   r   rC  assert_close)"rT   rt  r
  r   ru  rv  rU  r   r.  r  rV  r1  rw  rW  rX  rx  rY  r  rS  r   rl   	ref_modelref_loss
ddp_paramsr  rN  r^  expects_device_errorexpects_cpu_device
cpu_devicer}   context	fsdp_lossfsdp_unsharded_paramss"                                     rI   _test_fsdp_parityzFSDPTest._test_fsdp_parityV  s   D !5!55 	
<	
5 K!!&&(     ((
 	

 
 {m;	  4&M	#E*I!(I00$D0(+'A''A 1 

 )..01
*%6%6#2$4#2		

	Y)))"" 	
 # J *d+ j$*<*<LLJ#*J~:::#{3J$D0O[5O5O
 N/>3N3NN 	 N/>3N3NN 	 e,J#..0 ;  z:; $ ""%%0M3  	  	55!,% /+E++E 6 I	   e,J#..0 ;  z:;![1I /
 ; 	""8YE"J "=%!,	   ,9"K  	Y}[MAxPQWXX	YF	 	s$   "L L?	L<L77L<?M)rz  NFNFFN)$rB   rC   rD   r  r  r   rm   r  r'  r*  r,  r/  r2  r.   classmethodrP  rj   rk   r   floatr   r   r   r   r   r   rs  rb  rP   rA   rK   r   r   r   r  r  r  s   @rI   r  r    sC       : : d   0 0>JH3 4% 4%v 15 48+0#?CUyyU U 	U
 U #:.U U ".1U %)U U %-T#s(^$<Ux +/",,8<8<48!& %+0#04?C#g-(g %g )	g
 h'g g g  g $$45g $$45g ".1g g g %)g g  d38n-!g" %-T#s(^$<#grH   r  compile_compute_on_modulec                 @      fd G d dt               fd}|S )Nc                      t        j                  j                  j                  | i | t	        | d         r| d   j                          y y )Nr   )rg   r[  r  r   r  rG  )r`   ra   r  s     rI   !fully_shard_with_compiled_computez=compiled_fsdp_test.<locals>.fully_shard_with_compiled_compute  sN    **D;F;$,
G.1
 GOO1
rH   c                   (    e Zd Z e       Z e       Zy)*compiled_fsdp_test.<locals>.FullyShardModeN)rB   rC   rD   r   EAGERCOMPILED_COMPUTErG   rH   rI   FullyShardModer    s    6rH   r  c                 4     t                fd       }|S )Nc                     t         j                  j                  j                  }D ]  }|j                  k7  r"t               st        j                  dd       5t         j                  j                  j                  }t         j                  j                  j                  }t         j                  j                          |j                  k(  r|}n^|j                  k(  rAdt         j                  j                  _
        dt         j                  j                  _        }nt        d|       |	j                   |j"                  <    	| i | t         j                  j                          |	j                   |j"                  <   |t         j                  j                  _
        |t         j                  j                  _         y )Nz0Inductor on GPU needs Triton and recent GPU archr   )
stacklevelTr?   z!Need to implement FullyShardMode=)rg   r[  r  r   r  r6   warningswarnr  configskip_fsdp_hooks	_inductorcompile_threadsr  r  NotImplementedError__globals__rB   )
r`   ra   original_fully_shardmodeoriginal_skip_fsdp_hooksoriginal_compile_threadsfully_shard_patchr  r  funcs
          rI   wrapperz6compiled_fsdp_test.<locals>.decorator.<locals>.wrapper  sk   (-(9(9(>(>(J(J & R>///
MMJWX +0==+?+?+O+O(+0??+A+A+Q+Q(!!))+>///(<%^<<<;?EMM((8=>EOO**:(I%-;D6B  CT  !5!>!>?d%f%!!))+BV  !5!>!>?7O$$49Q&&6=RrH   r
   )r  r  r  r  s   ` rI   	decoratorz%compiled_fsdp_test.<locals>.decorator  s#    	t 	R 
 	RD rH   )r	   )r  r  r  r  s   ` @@rI   compiled_fsdp_testr     s"    " "$L rH   c                   &     e Zd Zd fdZd Z xZS )
SkipModulec                 \    t         |           t        j                  ddd      | _        y N
   Fr  )r   r   rj   r   linr  s    rI   r   zSkipModule.__init__6  s"    99R%0rH   c                 $    | j                  |      S r   )r  r/  s     rI   r   zSkipModule.forward:  s    xx{rH   rd   rl  r  s   @rI   r  r  5  s    1rH   r  c                   $     e Zd Z fdZd Z xZS )NestedLinearc                     t         |           |r:t        t        j                  ddd      j                  t                    | _        y t        j                  ddd      j                  t              | _        y r  )r   r   r!   rj   r   r   r   nested_linear)rT   	fsdp_wrapr   s     rI   r   zNestedLinear.__init__?  sV    !%biiBU&C&F&F{&S!TD!#2r!>!A!A+!NDrH   c                 $    | j                  |      S r   )r  r/  s     rI   r   zNestedLinear.forwardF  s    !!!$$rH   rl  r  s   @rI   r  r  >  s    O%rH   r  c                   $     e Zd Z fdZd Z xZS )	SkipModelc                    t         |           t        j                  ddd      j	                  t
              | _        t               j	                  t
              | _        t        t        |      t
              | _        y )Nr  Fr  )r  )rO  )r   r   rj   r   r   r   linearr  linear_skipr!   r  r  )rT   double_nestr   s     rI   r   zSkipModel.__init__K  sW    iiBU366{C%<??;7!;/;
rH   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r/  s     rI   r   zSkipModel.forwardS  s4    KKNQq!rH   rl  r  s   @rI   r  r  J  s    
rH   r  )FT)FFr  )rG   r   )
contextlibr  r?  r@  rT  unittestr  abcr   r   collections.abcr   r   copyr   enumr   r	   	functoolsr   typingr   r   r   r   r   r   rg   torch.distributedr[  rw   torch.nnrj   torch.nn.functionalr  r  torch.distributed._composabler   torch.distributed.device_meshr   torch.distributed.fsdpr   r   r   r   $torch.distributed.fsdp._common_utilsr   5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   "torch.distributed.fsdp._init_utilsr   2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   *torch.distributed.fsdp.sharded_grad_scalerr   torch.distributed.fsdp.wrapr   r    r!   torch.distributed.tensorr"   r#   r$   !torch.distributed.tensor.parallelr%   r&   r'   r(   r)   r*   torch.nn.parallel.distributedr+   r  *torch.testing._internal.common_distributedr,   r-   r.   r/   $torch.testing._internal.common_utilsr0   r1   r2   r3   r4   r5   torch.utils._tritonr6   r  r   rG  r8   r?  r;   rA   rK   rk   rP   r  r   r   r  r   r   r   r   r   r   r   r   r   r!  r8  r;  rK  rd  rh  rn  r  r(  r  r  contextmanagerr  r  r  r  r  r  r  r  r  r  rf   r   r  skipIfr  r  rb  r  r  r  r  rG   rH   rI   <module>r     s    	 	 
    # $ "    < <        4 4 
 ? S 
 I R R F F  F H   + K ::**,LK K 99))+LK L4 T BIIs 499$$ >% #99##""2299 2t 2>299 >d >DBII D$ D .Q"- Q"h[J- [J|$7 D]J. ]J@J
m J
Z
? 
.,ryy ,JJ* JJZ/")) /@*r}} *Z'299 '6 6 6 6 
X 
 
" 
X 
 
" 9H 9 9 *X * * .x .  . .x .  . :8 :  : >X >  >0!0 0 	0
 0. +-	 Tyy T II T c3h	 TF 45/  6&[# [|
2(4. 2j 	%299 	%		 rH   