
    qi                     ,   d dl Z d dlZd dlZd dlZd dlmZmZmZmZ d dl	m
Z
mZmZmZmZ d dlZd dlmZ d dlmc mc mZ d dlmc mc mZ d dlmc mc mZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/m0Z0m1Z1 d d	l2m3Z3 d d
l4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z> d dl?m@Z@ d dlAmBZB d dlCmDZD erd dlEmFZF dZG	 d dlHmIZImJZJ dZLdZMeNej                  ej                  f   ZPeeej                  ePf      ZQe:j                  e1j                  e:j                  e1j                  e:j                  e1j                  e:j                  e1j                  e:j                  e1j                  iZWe:j                  e:j                  gZXe:j                  e:j                  fZYe	 d_de'deQde:dee>   dee"   de'fd       ZZede'deQde"de'fd       Z[ede
de\fd       Z]ede"de\fd       Z^ede_dej                  fd        Z`ed!ej                  de_dej                  fd"       Zad!ej                  de_deNej                  ej                  f   fd#Zbe	 d_de'd$ej                  d%eeej<                  j                        d&eeeej<                  j                        eeej<                  j                        f   de'f
d'       Zed&efe
   d(e\ddfd)Zgede'd$ej                  d*ehej                     d+eee_ej                  f      de'f
d,       Zjede'd$ej                  de'fd-       Zkede'dee:   d.ee9   d/ee6   d0e\d1e\d2e_d3e_de'fd4       Zlede'de'fd5       Zmede'd6e5d7e\de'fd8       Zned_de'de"de'fd9       Zoede'de'fd:       Zpd$ej                  d;efej                     ddfd<Zqede'd=ej                  d+eee_ej                  f      d>eeej                  gdf      d?e\de'fd@       Zrede'd;efej                     d=ej                  fdA       ZsdBej                  dCeeej<                  j                        dehej                     fdDZt	 d_dBej<                  j                  d%ehej<                  j                     dEeeej<                  j                        dehej<                  j                     fdFZudBej<                  j                  d%ehej<                  j                     dehev   fdGZwdBej                  dehev   fdHZxd$ej                  d*ehej                     d+eee_ej                  f      ddfdIZyd+eee_ej                  f      dJe_dKe&deej                     fdLZzd$ej                  d*ehej                     d%ehej                     deNe\e\f   fdMZ{dBej                  d>eej                  gdf   d%ehej                     ddfdNZ|dBej                  dOeej                     d%ehej                     dKe&fdPZ}dBej                  d%ehej                     defej                     fdQZ~d$ej                  d*ehej                     dRehej                     dOeej                     ddf
dSZd;efej                     dTefej                     dOeej                     ddfdUZdV Zd$ej                  d*ehej                     dOeej                     dJe_dKe&dej                  fdWZd$ej                  d;efej                     dej                  ddfdXZdYefej                     ddfdZZd$ej                  d*ehej                     deej                     fd[Zd*ehej                     ddfd\Zde:fd]Zdej                  de j                  fd^Zy# eK$ r dZGY w xY w)`    N)Callable	GeneratorIterableIterator)Anyno_type_checkOptionalTYPE_CHECKINGUnion)default_hooks)
DeviceMesh)_get_default_group)_FSDPDeviceHandle
_FSDPState_get_module_fsdp_state_is_fsdp_flattened!_named_parameters_with_duplicatesclean_tensor_nameTrainingState)_FSDP_USE_FULL_PREC_IN_EVALFlatParameterFlatParamHandleHandleShardingStrategy)_FreeEventQueue)BackwardPrefetch
CPUOffloadFullOptimStateDictConfigFullStateDictConfigMixedPrecisionShardingStrategyStateDictConfigStateDictType)_Policy)DTensorExtensions)_sync_params_and_buffers)is_traceable_wrapper_subclass)RemovableHandleT)deferred_initfakeFi  _fsdp_syncedstateprocess_groupsharding_strategypolicydevice_meshreturnc                 (   ||t        d      |t        v }|r#|||t        d| d      t        | ||      } n4|r|| _        |j	                  d      | _        n||n	t               | _        | j
                  j                         | _        | j
                  j                         | _	        | j                  }|r|| j                  j                         z  }t        j                  j                  |      | _        || j                  z  | _        | S )NzcCannot pass both process_group and device_mesh at the same time. Please just pass only one of them.zManual wrapping with zA requires explicit specification of process group or device_mesh.r   mesh_dim)
ValueErrorHYBRID_SHARDING_STRATEGIES*_init_process_group_state_for_hybrid_shard_device_mesh	get_groupr,   r   ranksize
world_size_inter_node_pgr   DefaultState_get_gradient_predivide_factor_gradient_predivide_factor_gradient_postdivide_factor)r+   r,   r-   r.   r/   is_hybrid_strategydata_parallel_world_sizes          h/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/distributed/fsdp/_init_utils.py_init_process_group_staterD   Y   sI     [%<<
 	
 +.HH V^8K '(9': ;S S 
 ?}kE !,E"-"7"7"7"CE "/!:@R@T  $$))+EJ**//1E$// E$8$8$=$=$?? ""AA$	
 
$ 	!5#C#CC 
% L    c                    |rYt        |      r6|| _        |j                  d      | _        |j                  d      | _        nt        d|j                         |@t               }t        || j                  j                               \  }}|| _        || _        n2t        |      r|\  | _        | _        nt        dt        |             t        | j                        | _        | S )Nr   r2      z,Expected device_mesh to have ndim=2 but got zmExpected process_group to be passed in as either None or Tuple[dist.ProcessGroup, dist.ProcessGroup] but got r,   )"_is_valid_hybrid_shard_device_meshr7   r8   r<   r,   r4   ndimr   !_init_intra_and_inter_node_groups_device_handledevice_count_is_valid_hybrid_shard_pg_typetype_get_default_comm_hook_state_inter_node_state)r+   r,   r/   default_groupintra_node_groupinter_node_groups         rC   r6   r6      s
    -k:!,E $/#8#8!#8#DE "-"7"7"7"CE>{?O?O>PQ  
	*,-N5//<<>.
** // *-8 9F5E!5GGKMGZF[] 
 ;**E LrE   c                 j    t        | t              xr" t        |       dk(  xr t        d | D              S )N   c              3   P   K   | ]  }t        |t        j                           y wN)
isinstancedistProcessGroup).0pgs     rC   	<genexpr>z1_is_valid_hybrid_shard_pg_type.<locals>.<genexpr>   s     Jb
2t001J   $&)rY   tuplelenallrH   s    rC   rN   rN      s:     	=%( 	K!#	KJMJJrE   c                 D    t        | t              xr | j                  dk(  S )NrV   )rY   r   rJ   )r/   s    rC   rI   rI      s    k:.H;3C3Cq3HHrE   num_devices_per_nodec                 6    t        j                  |       \  }}|S )aU  
    Return a process group across the current node.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return an intra-node subgroup across
    [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank.
    For example, rank 3 would get [0, 1, ..., 7].
    )rZ   new_subgroups)rd   intra_node_subgroup_s      rC   _init_intra_node_process_groupri      s!     "//0DErE   global_process_groupc                 \   d}t        j                  |       }t        j                  |       }||z  }t        j                  |       |z  }t	        |      D ]?  }t	        |      D cg c]
  }|||z  z    }	}t        j
                  |	|      }
||k(  s>|
}A |t        | d      |S c c}w )a  
    Return an inter-node process group where each contained rank has the same local rank.

    For example, given each row is a distinct node:
    0  1  2  3  4  5  6  7
    8  9 10 11 12 13 14 15
    This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth
    depending on the process's rank. For example, rank 1 would get [1, 9], rank 5
    would get [5, 13].
    N)ranksbackendz. expected to assign inter-node pg, but did not)rZ   get_backendget_world_sizeget_rankrange	new_groupAssertionError)rj   rd   inter_node_pgsharding_backendr;   	num_nodesmy_local_rank
local_rankiranks_for_inter_groupgrps              rC   _init_inter_node_process_groupr|      s      M''(<=$$%9:J22IMM"67:NNM01  
=B9=M!
89J!223!
 !
 nn#8BRS&M  oKL
 	
 !
s   %B)c                 0    t        |      t        | |      fS )a  
    Initialize intra and inter-node process groups and return the ones corresponding to this process's rank.

    This function can be used to initialize process groups for ``HYBRID_SHARD`` or
    ``_HYBRID_SHARD_ZERO2`` in FSDP.
    This function assumes each node has an equal number of CUDA-enabled devices.
    Returns:
        Tuple[dist.ProcessGroup, dist.ProcessGroup]: Intra and inter-node process group.
    )ri   r|   )rj   rd   s     rC   rK   rK      s#     	'';<&';=QR rE   moduleignored_modulesignored_statesc                    ||t        d      d }|d u}|rt        |      }t        |d       ng }t        |t        |      ng d       t        |      dkD  r"t	        |d   t
        j                        r|}n|}t        ||      | _        t        || j                  |      | _
        t        || j                        | _        | S )NzfCannot pass both ignored_modules and ignored_states at the same time. Please just pass ignored_states.TFr   )r4   list_check_ignored_statesra   rY   nn	Parameter_get_ignored_modules_ignored_modules_get_ignored_params_ignored_params_get_ignored_buffer_names_ignored_buffer_names)r+   r~   r   r   ignored_parameterspassed_as_ignored_statesignored_states_lists          rC   _init_ignored_module_statesr     s     "~'A:
 	
 -T9">2148 %4%@D!b%	
 !#)!,bll;!41O1&/JE/E
 #<#E LrE   r   c                    t        |       dk(  ry|r`t        d | D              }t        d | D              }|s9|s6t        | D ch c]  }t        |       c}t              }t        d|       yyt        d | D              s6t        | D ch c]  }t        |       c}t              }t        d|       yc c}w c c}w )	z
    Check that the ignored states are uniformly parameters or uniformly modules.

    We may remove this check in the future if we permit mixing.
    r   Nc              3   P   K   | ]  }t        |t        j                           y wrX   )rY   r   r   r\   r+   s     rC   r^   z(_check_ignored_states.<locals>.<genexpr>J  s     UUE2<<8Ur_   c              3   P   K   | ]  }t        |t        j                           y wrX   rY   r   Moduler   s     rC   r^   z(_check_ignored_states.<locals>.<genexpr>K  s     S5*UBII6Sr_   )keyzUignored_states expects all nn.Parameter or all nn.Module list elements but got types c              3   P   K   | ]  }t        |t        j                           y wrX   r   r   s     rC   r^   z(_check_ignored_states.<locals>.<genexpr>T  s     LE:eRYY/Lr_   z>ignored_modules expects nn.Module list elements but got types )ra   rb   sortedrO   reprr4   )r   r   
all_paramsall_modulesr+   sorted_typess         rC   r   r   ?  s     >aUnUU
SNSS+!N"K54;"KQUVL**69  #.z L^LL!N"K54;"KQUVL%(  M #L #Ls   B;C ignored_params	device_idc                 6   d}|1t        |t        j                        r|nt        j                  |      }|t        ||      D ]|  }|j                  j                  dv r||j                  }+|j                  j                  |j                  k7  sOt        d|j                   d|j                  j                          |xs t        j                  j                         }|j                  dk(  rt        d      t        j                  |      | _
        | S )a=  
    Determine device handle used for initializing FSDP.

    If a device is specified by ``device_id``,
    then returns device handle corresponds to that device type. Otherwise, If the
    module is already on a non-CPU device, then the device type is that non-CPU device type.
    If the module is on CPU or meta, then the device type is the current accelerator device.
    See the :ref:`Accelerators<accelerators>` for details.


    This method will be called once ignored parameters was determined, as the device handle maybe needed
    for other initialization.
    N>   cpumetazLFSDP does not support modules with different device types but got params on z and r   zOFSDP needs a non-CPU accelerator device, but no accelerator device is detected.)rY   torchdevice_get_orig_paramsrO   RuntimeError_C_get_acceleratorr   from_devicerL   )r+   r~   r   r   determined_deviceparams         rC   _init_device_handler   \  s   (  )U\\2 i( 	
  %fn= 
	E||  O3 ($)LL!<<$$(9(>(>>&-->-C-C,DE%,,J[J[I\^ 
	 .L1J1J1L!!U*a  -889JKELrE   c                     t        |      | _        i }|j                         D ]  \  }}t        |      }|j                  ||<   ! || _        | S rX   )_get_buffer_names_buffer_namesnamed_buffersr   dtype_buffer_name_to_orig_dtype)r+   r~   r   buffer_namebuffers        rC   _init_buffer_stater     s_    
 ,F3E
 :<%335 ?V'428,,";/? (BE$LrE   mixed_precisioncpu_offloadlimit_all_gathersuse_orig_paramsbackward_prefetch_limitforward_prefetch_limitc                    | j                   dk(  rQ|t        j                  k7  r-t        j                  d|xs t        j
                   dd       t        j                  }n/|t        j                  k(  rt        j                  dt        d       |xs t        j
                  | _        |xs
 t               | _	        |5t        j                  j                  dt        | j                                t        j                  j!                  t"        d	      d
k(  | _        |xs
 t'               | _        || _        || _        t.        j0                  | _        d | _        t7               | _        t;        j<                         | _        tA        jB                  | j>                  ||      | _"        d | _#        i }|| _$        d }	|	| _%        g }
|
| _&        | S )NrG   z/FSDP is switching to use `NO_SHARD` instead of z since the world size is 1.rV   
stacklevelzoThe `NO_SHARD` sharding strategy is deprecated. If having issues, please use `DistributedDataParallel` instead.   z'torch.distributed.fsdp.mixed_precision. 1)'r;   r    NO_SHARDwarningswarn
FULL_SHARDFutureWarningr-   r   r   r   r   _log_api_usage_oncestrosenvirongetr   _use_full_prec_in_evalr   r   r   _use_orig_paramsr   IDLEtraining_state_is_rootr   _free_event_queuerZ   get_debug_level_debug_levelexec_order_utils_ExecOrderData_exec_order_data_unshard_event_fully_sharded_module_to_handle_handleparams)r+   r-   r   r   r   r   r   r   r   r   r   s              rC   _init_core_stater     s    1 0 9 99MMA$C(8(C(CD E'' 	 -55	.77	7< 	
 0N3C3N3NE+?~/?E"$$5c%:O:O6P5QR	
 	

2B73> 
  $3z|E/E,E(--EEN-/E--/E-<<E
  E IK#,KE) *.GEM"$FELLrE   c                 f    g }|| _         g }|| _        g }|| _        d| _        d | _        d | _        | S )NT)_root_pre_forward_handles_pre_forward_handles_post_forward_handles_sync_gradients
_comm_hook_comm_hook_state)r+   r   r   r   s       rC   _init_runtime_stater     sK     8:&?E#24!5E35"7E EE!ELrE   backward_prefetchforward_prefetchc                 "    || _         || _        | S rX   )r   r   )r+   r   r   s      rC   _init_prefetching_stater     s     0E-E LrE   c                     ||j                         nd }|r+|| j                  k7  rt        | j                        | _        | S d | _        | S rX   )_get_root_meshr7   r$   rL   _fsdp_extension)r+   r/   	root_meshs      rC   _init_extensionr      sT    
 1<0G**,TI yE$6$66 1%2F2F G
 L !%LrE   c                     t         j                  | _        t               }t	               | _        || _        i }|| _        | S rX   )r"   FULL_STATE_DICT_state_dict_typer   r   _optim_state_dict_config_state_dict_config_unshard_params_ctx)r+   state_dict_configunshard_params_ctxs      rC   _init_state_dict_stater     s?    *::E)<)>%=%?E"0E57 2ELrE   r   c                     |D ]X  }t        |j                        dk(  sd}| j                         D ]  \  }}||u s|} n |st        d      t	        d| d       y)z
    Verify if the parameters are accepted by FSDP. The only restriction now
    is that the parameter cannot be a scalar tensor (param.shape == []).
    r   r   zExpected param_name to be setz/FSDP doesn't support scalar parameters. Change z& to a 1D tensor with numel equal to 1.N)ra   shapenamed_parametersrs   r4   )r~   r   r   
param_namenameparam_s         rC   _verify_managed_paramsr     s    
  u{{q J & 7 7 9 fF?!%J $%DEE$%KM rE   fully_sharded_moduleparam_init_fnsync_module_statesc                 r    t        | j                  |       t        | j                   j                        }t        | j                   j                        \  }}|s|r|t        || j                         nA|r#t        || j                   j                         n|rt        j                  | fd        j                  D 	ch c]  }|j                         D ]  }	|	  }
}}	t        | j                  |
|       t        | j                  | j                   j                         _        t        t!        | j                              }t#        ||       |r@t%        || j&                          j(                  t*        v rt%        || j,                         t/         ||        S c c}	}w )zHInitialize a ``FlatParamHandle`` from a module ``fully_sharded_module``.c                 >    t        |       d u xr | j                  vS rX   )r   r   )	submoduler+   s    rC   <lambda>z0_init_param_handle_from_module.<locals>.<lambda>P  s(    '=i'HD'P (8!7!77 rE   )check_fn)_check_single_device_moduler   _get_device_from_device_idr9   rL   _need_to_materialize_moduler   _materialize_with_param_init_fn_materialize_meta_moduler(   materialize_modulebuffers_move_module_to_device_get_compute_devicecompute_devicer   r   r   _sync_module_params_and_buffersr,   r-   r5   r<   _init_param_handle_from_params)r+   r   r   r   r   device_from_device_idis_meta_moduleis_torchdistX_deferred_initignored_moduler   ignored_buffersmanaged_paramss   `           rC   _init_param_handle_from_moduler  1  s      4e6K6KYW65::u33 3Ne33U5K5K3/N/ 	5=;T' -1G1G	
 
  ""  		
 
%(( 8	
 $44$,,.  	O  	 /

E *+?AVAVWXN/@' .%2E2E	
 ""&@@+$ne6J6J #5.:NOL?s   F3c                    t        |      dk(  ry t        ||| j                  t        | j                     | j
                  j                  | j                  j                  | j                  j                  | j                  j                  | j                  | j                  | j                        }|j                          | j                  rt!        d      | j"                  j%                  |j&                         || _        || j(                  |j*                  <   t-        j.                  d      }| j
                  j                  r,|j&                  j.                  |k7  r|j1                  |       y y y )Nr   )fsdp_extensionz!Expected state._handle to be Noner   )ra   r   r  SHARDING_STRATEGY_MAPr-   r   offload_paramsr   param_dtypereduce_dtypekeep_low_precision_gradsr,   r   r   shardr   rs   r   append
flat_paramr   _fully_sharded_moduler   r   flat_param_to)r+   r   r   handle
cpu_devices        rC   r  r  v  s+    6{ae556(())**66,,F LLN}}@AA	LL))*EMJPE))&*F*FGe$J''F,=,=,D,D
,RZ( -S'rE   root_moduler   c           	      2   d}	 |t        |      n	t               }|D ]V  }t        |t        j
                  j                        st        |dt        |       z         t        |      sMt        d       | j                         D ])  }t        j                  |      r|j                  |       + |D ch c]3  }|j                         D ]  }t        |t        j                        s|  5 }}}| |v rt        j                   d d       | j                         D ]B  }t        |      }	|	t#        |	d	      st%        d
      |j'                  |	j(                         D |S # t        $ r }t        |dt        |       z         |d}~ww xY wc c}}w )ah  
    Check that ``_ignored_modules`` is an iterable of ``nn.Module`` s without any FSDP instances.

    Return the modules contained in their module
    subtrees as a :class:`set`. Nested FSDP instances are excluded, but their
    already-computed ignored modules are included.

    ``_ignored_modules`` represents the argument passed by the user to FSDP.
    z>`ignored_modules` should be an iterable of `torch.nn.Module`s Nzbut got zbut got an iterable with z1`ignored_modules` should not include FSDP moduleszTrying to ignore the top-level module passed into the FSDP constructor itself will result in all parameters being ignored and is not well-supported: rV   r   r   z?Expected optional_fsdp_state to have _ignored_modules attribute)set	TypeErrorrO   rY   r   r   r   r   r4   modulestraversal_utils_composableadd	fsdp_fileFullyShardedDataParallelr   r   hasattrrs   updater   )
r%  r   
msg_prefixignored_root_moduleser~   childr   r  optional_fsdp_states
             rC   r   r     s    RJQ%5%AC !su 	
 ' R&%((//2J+DT&\N)SSTT!&) PQQR %%' -**62 $$V,- +^^% %!C!CD 	O  o%228; 		
 !((* I	4Y?*.0BC$U  ""#6#G#GHI Q  Q
x5E0F/G%HHIqPQ$s   E' 78F'	F0FFr   c                    t               }|D ch c]%  }|j                         D ]  }t        |      r| ' }}}|j                  |       |,|D ch c]  }t        |      r| }}|j                  |       | j	                         D ]B  }t        |      }	|	t        |	d      st        d      |j                  |	j                         D |S c c}}w c c}w )z
    Return the parameters of the modules in ``ignored_modules`` and the parameters in ``ignored_parameters``.

    :class:`FlatParameter` s are excluded from the result.
    r   z>Expected optional_fsdp_state to have _ignored_params attribute)	r'  
parametersr   r0  r)  r   r/  rs   r   )
r%  r   r   all_ignored_paramsmpparams_in_ignored_modulesparams_in_ignored_parametersr  r5  s
             rC   r   r     s    36% #!ALLN!'(BTUVBW!	! ! 78%)(
1CA1FA(
$ (
 	!!">? !((* K	4Y?*.0AB$T  %%&9&I&IJK -!(
s   #CCC%Cc           	         t               }|D ch c]  }|j                         D ]  }|  }}}|j                  | j                         D ch c]  \  }}||v rt	        |       c}}       | j                         D ]B  }t        |      }|t        |d      st        d      |j                  |j                         D |S c c}}w c c}}w )z6Return the cleaned buffer FQNs in ``ignored_modules``.r   zDExpected optional_fsdp_state to have _ignored_buffer_names attribute)
r'  r
  r0  r   r   r)  r   r/  rs   r   )	r%  r   all_ignored_buffer_namesr9  r   buffers_in_ignored_modulesr   r  r5  s	            rC   r   r     s    
 *- ("aiik",2"" " ## (3'@'@'B	
#V33 k*	
 !((* W	4Y?*.0GH$Z  %++,?,U,UVW $#-"
	
s   CC
c                 f    | j                         D ch c]  \  }}t        |       c}}S c c}}w )zrReturn the fully prefixed names of all buffers in the module hierarchy rooted at ``root_module`` as a class:`set`.)r   r   )r%  r   rh   s      rC   r   r     s5     >I=V=V=X+9;+&  s   -c                     t        | |      D ch c]  }|j                   }}t        |      dk(  r%t        j                  d      |v r|t	        d      yt        |      dkD  rt	        d|       yc c}w )z
    Raise an error if ``module`` has original parameters on multiple devices, ignoring the parameters in ``ignored_params``.

    Thus, after this method, the
    module must be either fully on the CPU or fully on a non-CPU device.
    rV   r   NzTTo support a module with both CPU and GPU params, please pass in device_id argument.rG   z;FSDP only supports single device modules but got params on )r   r   ra   r   r   )r~   r   r   r   devicess        rC   r  r    s     *:&.)QRu||RGR 7|qU\\%0G;5  
 
W	I'S
 	
 
 Ss   A4r9   device_handlec                 b   | yt        | t        j                        r| nt        j                  |       }|j                  dk7  rk|j                  _t        j                  d|  d| d|j                          d|j                   d	d	       t        j                  |j                               }|S )
z
    Return a ``torch.device`` for the specified ``device_id``.

    Processes ``device_id`` and returns either the corresponding device or
    ``None`` if ``device_id`` is ``None``.
    Nr   z"FSDP got the argument `device_id` z	 on rank zJ, which does not have an explicit index. FSDP will use the current device z6. If this is incorrect, please explicitly call `torch.zk.set_device()` before FSDP initialization or pass in the explicit device index as the `device_id` argument.rV   r   )rY   r   r   rO   indexr   r   current_device)r   r9   rC  r   s       rC   r  r  =  s     	5<<8	ell9>U  {{e 409f 00=0L0L0N/O PCCI;;- P11 	
 m::<=MrE   c                    t        t        | |            }t        d |D              }| j                         D ]-  }||v r|j	                  d      D ]  }||j
                  z  } / | xr t        xr t        d |D              }||fS )z
    Return if ``module`` has parameters on meta device and if ``module`` is using torchdistX deferred initialization.

    At most of the returned bools can
    be ``True``. If either is ``True``, then ``module`` needs to be
    materialized.
    c              3   4   K   | ]  }|j                     y wrX   )is_metar\   r   s     rC   r^   z._need_to_materialize_module.<locals>.<genexpr>h  s     C5Cs   Frecursec              3   F   K   | ]  }t        j                  |        y wrX   )r)   is_fakerJ  s     rC   r^   z._need_to_materialize_module.<locals>.<genexpr>t  s     @U#@s   !)r   r   anyr)  r
  rI  _TORCHDISTX_AVAIL)r~   r   r   r  r  r  bufr  s           rC   r  r  [  s     *6>BCNCNCCN ^^% *	'$$U$3 	*Cckk)N	**  	A	A@@@  
 666rE   c                     t        |      st        d| dt        |             t        | |      }|D ]
  } ||        y )Nz	Expected z to be callable but got )callabler4   rO   _get_modules_to_materialize)r%  r   r   modules_to_materializer~   s        rC   r  r  y  sV    
 M"&>tM?R>ST
 	
 9oV( frE   r  c           	      >   |xs# t        j                  |j                               }t        | |      }d }	 t        j                         5  |D ]u  }t        j                  |j                  d      |j                  d            }t        t        |            dkD  }|sS|j                  |d       |j                          w 	 d d d        y # 1 sw Y   y xY w# t        $ r7}	t        j                  dt!        |	       dt#        |       dd	       |	d }	~	ww xY w)
NFrK  r   )r   rL  zIUnable to call `reset_parameters()` for module on meta device with error z(. Please ensure that your module oftype z* implements a `reset_parameters()` method.rV   r   )r   r   rF  rT  no_grad	itertoolschainr7  r
  ra   r   to_emptyreset_parametersBaseExceptionr   r   r   rO   )
r%  r  r   rC  materialization_devicerU  r~   module_state_iterhas_module_statesr3  s
             rC   r  r    s$    3 ell$$&7 9oVF ]]_ 	.0 . %.OO%%e%4NN5N1%!
 %(->(?$@1$D!$OO+A5OQ++-.	. 	. 	.  !!$Q )L>!KM 		
 s<   C AC!%CC CC C 	D%2DDc                 "   g }t        j                  | g      }| h}|rq|j                         }|j                  |       |j	                         D ]:  }||vst        |      ||vs|j                  |       |j                  |       < |rq|S rX   )collectionsdequepopleftr  childrenr   r,  )r%  r   rU  queuevisited_modulesr~   child_modules          rC   rT  rT    s    
 /1{m,E'2mO
%%f-"OO- 	+LO3*<8@ 7##L1\*	+  "!rE   r  c                    t        j                  d      |	t        j                         }|j	                  |        g }g }|r|j                         }|j                  fd|j                  d      D               |j                  fd|j                  d      D               |j                         D ].  }t        |t        j                        r|j	                  |       0 |r|D 	cg c]	  }	|	|vs|	 }
}	|D 	cg c]	  }	|	|vs|	 }}	t        |
||       yt        t        | |      d      }||j                  k(  rt!                yyyc c}	w c c}	w )a  
    Move ``module`` depending on ``device_from_device_id`` and its current device.

    This includes moving ignored modules' parameters.

    - If ``device_from_device_id`` is not ``None``, then this moves
    ``module`` to the device.
    - If ``device_from_device_id`` is ``None``, then this does not move
    ``module`` but warns the user if it is on CPU.

    Precondition: ``_check_single_device_module()``.
    r   Nc              3   @   K   | ]  }|j                   k(  r|  y wrX   r   )r\   r   r$  s     rC   r^   z)_move_module_to_device.<locals>.<genexpr>  s%      <<:-    FrK  c              3   @   K   | ]  }|j                   k(  r|  y wrX   rj  )r\   r   r$  s     rC   r^   z)_move_module_to_device.<locals>.<genexpr>  s%      ==J. rk  )r   r   ra  rb  r  rc  extendr7  r
  rd  rY   r-  r.  _move_states_to_devicenextr   _warn_cpu_init)r~   r   r  r  re  r   r
  curr_moduler  r:  params_to_movebufs_to_mover   r$  s                @rC   r  r    s\   $ e$J( /:.?.?.AV%'&(--/K
 MM (33E3B 
 NN )11%1@ 
 )113 ,	!)Y-O-OPLL+,! & &,Gq/F!GG#*Gaa.FGG~|=RS!&.94@EU\\Z7 8 HGs   1	E;E	EEr
  c                 6   t        |       dk(  rt        |      dk(  ryt        |       dkD  r| d   j                  }nt        |      dkD  r|d   j                  }t        j                  d      }|| D ]k  }t        j                         5  |j	                  |      |_        |j                  *|j                  j	                  |      |j                  _        ddd       m |D ]  }|j	                  |      |_         y|k(  rt                yy# 1 sw Y   xY w)z
    Move states to the specified device.

    Precondition: ``_check_single_device_module()`` and module's parameters and
    buffers have been materialized if needed.
    r   Nr   )ra   r   r   rW  todatagradrp  )r   r
  r  rF  r$  r   r   s          rC   rn  rn    s    6{aCLA-
6{Q))	W	 **e$J(  	KE K"XX&;<
::)&+jjmm4I&JEJJOK K	K
  	;F ))$9:FK	;	:	% 
&K Ks   	ADD	c                  2    t        j                  dd       y )Nam  The passed-in `module` is on CPU and will thus have FSDP's sharding initialization run on CPU, which may be slower than on GPU. We recommend passing in the `device_id` argument for FSDP to move `module` to GPU for the sharding initialization. `module` must also be on GPU device to work with the `sync_module_states=True` flag since that requires GPU communication.rV   r   )r   r    rE   rC   rp  rp    s    MM	1 rE   c                     t        t        | |      d      }|&|j                  j                  dk7  r|j                  }n#t	        j                  |j                               }|||k7  rt        d| d| d|       |S )a)  
    Determine and return this FSDP instance's compute device.

    If the module is already on a non-CPU device, then the compute device is that non-CPU
    device. If the module is on CPU, then the compute device is the current
    device.

    Since this method should be called after materializing the module, any
    non-CPU device should not be meta device. For now, the compute device is
    always a CUDA or CUDA-like device with its explicit index.

    Precondition: ``_check_single_device_module()`` and
    ``_move_module_to_device()``.
    Nr   z4Inconsistent compute device and `device_id` on rank z: z vs )ro  r   r   rO   r   rF  r4   )r~   r   r  r9   rC  r   r  s          rC   r  r  %  s    * !&.94@EU\\..%7m&B&B&DE(^?T-TB4&d#8"9;
 	
 rE   c                 ~   g }| j                         D ]  }t        |t        d      rt        |t        d       |j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }	}|j                  |	       |j                  |        |D ]l  }
|
j	                         }t        |      r>|j                         \  }}|D cg c]  }t        ||       }}|j                  |       \|j                  |       n t        |       t        ||t        d       yc c}w c c}w )z
    Synchronize module states (i.e. parameters ``params`` and all not-yet-synced buffers) by broadcasting from rank 0 to all ranks.

    Precondition: ``sync_module_states == True`` and ``self.process_group`` has
    been set.
    FTr   )srcN)r
  getattrFSDP_SYNCEDsetattrdetachr&   __tensor_flatten__rm  r  +_check_module_states_for_sync_module_statesr%   PARAM_BROADCAST_BUCKET_SIZE)r~   r   r,   module_statesr   detached_bufferattrsrh   attrinner_buffersr   detached_paraminner_paramss                rC   r  r  H  s*    )+M.." 6v{E2FK.$mmoO,_= +==?qLQ RD$!? R R$$]3$$_56  1(8%88:HE1FKLdGND9LLL  .  01 0>#	 !S Ms   +D5D:r  c                 D    | rt        d | D              rt        d      y y )Nc              3   `   K   | ]&  }|j                   t        j                   d       k(   ( yw)r   N)r   r   )r\   tensors     rC   r^   z>_check_module_states_for_sync_module_states.<locals>.<genexpr>w  s'      17e,,s   ,.zThe module has CPU parameters or buffers when `sync_module_states=True`, which requires them to be on GPU. Please specify the `device_id` argument or move the module to GPU before passing it to FSDP.)rO  r4   )r  s    rC   r  r  t  s7      ;H  C
 	
}rE   c              #      K   | j                         }	 	 t        |      }||vrt        |      s| # t        $ r Y yw xY ww)aD  
    Return an iterator over the original parameters in ``module``.

    The iterator does not return
    the parameters in ``ignored_params``, any ``FlatParameter`` s (which may be
    present due to nested FSDP wrapping), or any original parameters already
    flattened (only relevant when ``use_orig_params=True``).
    N)r7  ro  r   StopIteration)r~   r   	param_genr   s       rC   r   r     sT      !!#IOEN*3Ee3L   s   A 4 	A AA  Ac           	          t        |       D ]A  \  }}||vst        |      rt        d| d|j                          d|j                          y)a5  
    Check that original parameters in ``fsdp_module`` have been flattened.

    The flattened parameters are made
    invisible to ``named_parameters()`` for the module hierarchy rooted at
    ``fsdp_module``. This should be called as a sanity check after flattening
    the wrapped module's parameters.
    z Found an unflattened parameter: z;  N)r   r   r   r:   	__class__)fsdp_moduler   r   r   s       rC   _check_orig_params_flattenedr    s^     ?{K 
E&/A%/H2:,b::<.%//!24 rE   c                 h    | t         j                  k(  rt        j                  S t        j                  S rX   )r    r   r   allreduce_hookreduce_scatter_hook)r-   s    rC   _get_default_comm_hookr    s3      0 9 99 	$$ ..rE   c                 .    t        j                  |       S )NrH   )r   r=   rH   s    rC   rP   rP     s     %%MBBrE   rX   )ra  rX  r   r   collections.abcr   r   r   r   typingr   r   r	   r
   r   r   torch.distributeddistributedrZ   (torch.distributed.fsdp._exec_order_utilsfsdp_exec_order_utilsr   'torch.distributed.fsdp._traversal_utils_traversal_utilsr*  2torch.distributed.fsdp.fully_sharded_data_parallelfully_sharded_data_parallelr-  torch.nnr   (torch.distributed.algorithms._comm_hooksr   torch.distributed.device_meshr   "torch.distributed.distributed_c10dr   $torch.distributed.fsdp._common_utilsr   r   r   r   r   r   r   "torch.distributed.fsdp._flat_paramr   r   r   r   %torch.distributed.fsdp._limiter_utilsr   torch.distributed.fsdp.apir   r   r   r   r   r    r!   r"   torch.distributed.fsdp.wrapr#   &torch.distributed.tensor.parallel.fsdpr$   torch.distributed.utilsr%   torch.utils._python_dispatchr&   torch.utils.hooksr'   rP  
torchdistxr(   r)   ImportErrorr  r~  r`   r[   HybridShardProcessGroupTypeProcessGroupTyper   r   SHARD_GRAD_OPHYBRID_SHARD_HYBRID_SHARD_ZERO2r  r5   #NO_RESHARD_AFTER_FORWARD_STRATEGIESrD   r6   boolrN   rI   intri   r|   rK   r   r   r   r   r   r'  r   r   r   r   r   r   r   r   r   r  r  r   r   r   r   r   r  r  r  r  r  rT  Tensorr  rn  rp  r  r  r  r   r  r  r=   rP   ry  rE   rC   <module>r     s
     	  C C E E    C C A A F F  B 4 A    B	 	 	 0 D < F 1 . 0 #D$5$5t7H7H$HI E$"3"35P"PQR 
 5>>!7!B!B""$:$H$H!!#9#F#F((*@*T*T  !!(( 
 ""((' #  )-00#0 (0 W	0
 *%0 0 0f ((#( ( 	( (V # $   IJ I4 I I  ARAR   "++"" 
" "J++ 4d///0&  	++II+ huxx78+ %((,,-.%((//9R0SS	+ + +\I9=	: --II- %- c5<</01	-
 - -` II  " ?? 01? n-? *%	?
 ? ? !?  ? ? ?D    		'	 	 		 	 : J *   *   299 d2<<6H T ( AA))A c5<</01A Hbii[$%678	A
 A A AH ))) ))) )>::x89: 	^:@ BF"")" !%((*<*<!=>" 				"J$$)$ 	X$@299 S 
II
%
 c5<</01
 
	
<c5<</01
 % ell	<7II7%7 ^7 4:	7<RYYK-. ^ 
	###ELL1# ^# %	#L""-0^"	"))_",3II3%3 &3 $ELL1	3
 
3l%,, $ELL1 
	@	II% $ELL1 	
 % \\F)II)) $$) 
	)X

%

	

II% bll,% 
(.> C$$CCw#  s   ^ ^^