
    qi                        U d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlZd dlmc mZ d dlmc mc mZ d dlmZ d dlmZmZmZmZ d dlmZ d d	l m!Z!m"Z"m#Z#m$Z$m%Z% d d
l&m'Z'  ejP                  e)      Z*da+ee,   e-d<   e j\                  d'de,fd       Z/ G d de      Z0i Z1e2e3ejh                  e5f   df   e-d<   dede5ddfdZ6d(dZ7 G d d      Z8	 d)dedede,dz  de9e0   fdZ:e
	 d)dedede,dz  de9e0   fd       Z;dddddejx                  d ed!ed"e,d#e,de,dz  dejx                  fd$Z= G d% d&ej|                  j~                        Z@y)*    N)defaultdict)Sequence)cache)cast
NamedTupleOptional)_are_we_tracing)DTensorSpec
ShardOrderShardOrderEntry
TensorMeta)
DeviceMesh)_StridedShardPartial	Placement	ReplicateShard)get_active_debug_mode#_FORCE_MIN_COST_REDISTRIBUTION_PLANenabledc              #   8   K   t         }| a 	 d |a y# |a w xY ww)uX	  
    Context manager to control the redistribution planning strategy for DTensor operations.

    This context manager allows you to choose between two algorithms for computing the
    sequence of collective operations needed to redistribute a DTensor from one placement
    to another:

    - **Graph-based**: Uses Dijkstra's algorithm to find the minimum-cost path
      through all possible placement transformations. This approach considers the global
      cost of all collective operations and finds the optimal sequence. Best for complex
      redistribution patterns where reducing communication cost and memory overhead is critical.

    - **Greedy**: Uses a heuristic approach that makes locally optimal choices
      at each step. This is faster to compute but may not produce the globally optimal
      transformation sequence. Best for simple redistribution patterns or when planning
      speed is more important than optimal communication.

    **Default Behavior (without this context manager):**

    When this context manager is NOT used, the algorithm selection follows this priority:

    1. **Non-default shard orders**
       → Always use graph-based algorithm (required for correctness)

    2. **Explicit `use_graph_based_transform` parameter** to `_gen_transform_infos_non_cached`
       → Use the specified algorithm (True = graph-based, False = greedy)

    3. **No explicit parameter** (default case)
       → Use greedy algorithm for faster planning

    **Behavior with this context manager:**

    This context manager overrides the default selection by setting the global flag
    `_FORCE_MIN_COST_REDISTRIBUTION_PLAN`, which takes precedence over the explicit
    `use_graph_based_transform` parameter (but not over non-default shard order requirements).

    **Cache Considerations:**

    The redistribution planner caches transform info for performance via the `@cache`
    decorator on `_gen_transform_infos`. If you need to change the algorithm selection
    for the same input specs, clear the cache using `_gen_transform_infos.cache_clear()`
    to ensure the new setting takes effect and doesn't reuse cached results from a
    previous run.

    Args:
        enabled (bool): If True, forces the use of the graph-based algorithm.
                       If False, forces the use of the greedy algorithm.
                       Default: True
    N)r   )r   	old_values     l/home/ubuntu/crypto_trading_bot/.venv/lib/python3.12/site-packages/torch/distributed/tensor/_redistribute.py use_min_cost_redistribution_planr   +   s(     h 4I*1'8.7+i+s   	 c                   <    e Zd ZU eed<   eeef   ed<   ee   ed<   y)_TransformInfomesh_dimsrc_dst_placementslogical_shapeN)__name__
__module____qualname__int__annotations__tupler   list     r   r   r   g   s!    Mi2339r(   r   DTensorRedistributePlanner_planner_cachedevice_meshtensor_dimensionreturnc                 |    t        j                  |       |f}|t        vrt        | |      }|t        |<   t        |   S )a  
    Factory function to get or create a DTensorRedistributePlanner instance.
    This function provides transparent caching of planner instances based on
    device_mesh and tensor_dimension. Multiple calls with the same parameters
    will return the same cached instance for better performance.
    Args:
        device_mesh: The device mesh for the planner
        tensor_dimension: Number of tensor dimensions
    Returns:
        A DTensorRedistributePlanner instance (potentially cached)
    )weakrefrefr*   r)   )r+   r,   	cache_keyplanners       r   get_redistribute_plannerr3   t   sB     [)+;<I&,[:JK$+y!)$$r(   c                  ,    t         j                          y)z8Clear the cache of DTensorRedistributePlanner instances.N)r*   clearr'   r(   r    clear_redistribute_planner_cacher6      s    r(   c                      e Zd ZdZ ej
                  dd       G d d             Zd Zede	e
ee
   f   defd	       Zedede	e
ee
   f   fd
       Ze	 d)dedee   deedf   dedz  def
d       Zdede
ddfdZ	 	 	 	 	 d*de
de
de
de
de
ddfdZdeedf   dede	de
f   fdZded eded   fd!Zddd"e
d#ee
df   dee
   fd$Zd%ed&ed#ee
df   dee   fd'Zd%ed&edee   fd(Zy)+r)   a  
    This class is used to plan the collective calls to transform the local shard
    of the DTensor from its current spec to the target spec.
    Suppose there are N tensor dimensions and M mesh dimensions, the total
    possible state size will be (N+2)*M*M!.
    Note: Use get_redistribute_planner() factory function instead of direct
    instantiation for automatic caching.
    T)frozenslotsc                       e Zd ZU eedf   ed<   eed<    ej                  dddd      Z	e
dz  ed<   d Zd	 Zd
 Zde
fdZde
fdZdedefdZy)$DTensorRedistributePlanner.DistState.
placementstensor_dim_to_mesh_dimNF)defaultinitreprcompare_hashc                 V    t        j                  | j                  | j                        S N)r
   format_shard_order_strr<   r=   selfs    r   __str__z,DTensorRedistributePlanner.DistState.__str__   s%    55++ r(   c                 "    | j                         S rD   )rH   rF   s    r   __repr__z-DTensorRedistributePlanner.DistState.__repr__   s    <<>!r(   c                 N    t         j                  | d| j                                y )NrB   )object__setattr___compute_hashrF   s    r   __post_init__z2DTensorRedistributePlanner.DistState.__post_init__   s"    ""$r(   r-   c                 R    | j                   | j                   S | j                         S rD   )rB   rN   rF   s    r   __hash__z-DTensorRedistributePlanner.DistState.__hash__   s#    !%!74::QT=O=O=QQr(   c                 D    t        | j                  | j                  f      S rD   )hashr<   r=   rF   s    r   rN   z2DTensorRedistributePlanner.DistState._compute_hash   s$    OO// r(   otherc                     t        |t        j                        sy| j                  |j                  k7  ry| j                  | j
                  f|j                  |j
                  fk(  S )NF)
isinstancer)   	DistStaterB   r<   r=   )rG   rT   s     r   __eq__z+DTensorRedistributePlanner.DistState.__eq__   sa    e%?%I%IJzzU[[(++   ,, r(   )r    r!   r"   r%   r   r$   r   dataclassesfieldrB   r#   rH   rJ   rO   rQ   rN   rL   boolrX   r'   r(   r   rW   r;      sy    )S.)) **-K--u5%
sTz 	
		"		Rc 	R	3 		 	4 	r(   rW   c                 ^     t        |t        t        z        rt         fd|D              S |S )z<Convert a nested list structure to a nested tuple structure.c              3   @   K   | ]  }j                  |        y wrD   )	_to_tuple).0itemrG   s     r   	<genexpr>z7DTensorRedistributePlanner._to_tuple.<locals>.<genexpr>   s     <$-<s   )rV   r&   r%   )rG   xs   ` r   r^   z$DTensorRedistributePlanner._to_tuple   s'    a&<!<<<r(   rb   r-   c                 T    t        d t        | j                               D              S )zConvert dict to ShardOrderc              3   R   K   | ]  \  }}|rt        |t        |              ! yw))
tensor_dim	mesh_dimsN)r   r%   )r_   keyvalues      r   ra   zADTensorRedistributePlanner._dict_to_ShardOrder.<locals>.<genexpr>   s-      
U seElCC
s   %')r%   sorteditems)rb   s    r   _dict_to_ShardOrderz.DTensorRedistributePlanner._dict_to_ShardOrder   s)      
$QWWY/
 
 	
r(   c                 v    t        t              }| D ]$  }t        |j                        ||j                  <   & |S )z1Convert ShardOrder to dict with tensor dim as key)r   r&   rf   re   )rb   tensor_mesh_dim_dictentrys      r   _ShardOrder_to_dictz.DTensorRedistributePlanner._ShardOrder_to_dict   s@      +40 	KE59%//5J !1!12	K##r(   Nmeshtransform_infossrc_placement.src_shard_orderc                 $   t        |      | j                  k(  sJ |t        j                  |      }t	        |      }t
        j                  |      }t
        j                  t        |      |      }|g}|D ]  }|j                  \  }	}
|	j                         r6|	j                  }||v rt        ||         dkD  sJ ||   j                          |
j                         r3|
j                  }||vrg ||<   ||   j                  |j                         |
||j                  <   t
        j                  t        |      t
        j                  |            }|j                  |        dj!                  |D cg c]  }t#        |       c}      S c c}w )a  
        Generate a string representation of the sequence of state transitions
        (placements and shard orders) as described by the given transform_info.

        Args:
            mesh: The DeviceMesh used for the redistribution.
            transform_infos: A sequence of _TransformInfo objects describing each
                transformation step.
            src_placement: The initial tuple of Placement objects.
            src_shard_order: (Optional) The initial ShardOrder representing
                the mapping of tensor dimensions to mesh dimensions. If None,
                the default shard order is computed from src_placement and mesh.

        Returns:
            A string showing the sequence of DistState transitions, separated by '->'.
        r   z->)lenndimr
   compute_default_shard_orderr&   r)   ro   rW   r%   r   is_sharddimpopappendr   rk   joinstr)rp   rq   rr   rs   cur_placementshard_order_dict	cur_state
state_listtransform_infosrc_dim_placementdst_dim_placementsrc_dimdst_dim	new_statess                  r   stringify_transform_infosz4DTensorRedistributePlanner.stringify_transform_infos   s   . =!TYY...")EEmTO]+5II
 /88- /
	 

 . 	)N3A3T3T00 ))++////C8H8Q4RUV4VV )--/ ))++//"2202$W- )001H1HI5FM.1122<<m$*>>?OPI i(%	)& yy*5Q#a&5665s   4Fr+   r,   c                     || _         |j                         | _        | j                  J || _        | j	                          y)z
        Initialize DTensorRedistributePlanner.

        Args:
            device_mesh: The device mesh for this planner
            tensor_dimension: Number of tensor dimensions
        N)r+   get_coordinate
coordinater,   setup_collective_cost)rG   r+   r,   s      r   __init__z#DTensorRedistributePlanner.__init__  s@     '%446*** 0""$r(   all_reduce_costall_to_all_costall_gather_costreduce_scatter_cost
chunk_costc                 J    || _         || _        || _        || _        || _        y)zN
        Set up the cost weights for different collective operations.
        N)r   r   r   reduce_scatterr   )rG   r   r   r   r   r   s         r   r   z0DTensorRedistributePlanner.setup_collective_cost-  s+      /..1$r(   r<   tensor_mesh_dim_tupler;   c                    i }t         j                  |      }|D ]  }|j                  }t        | j                        D ]  }||k(  r	||   j                         }||   j                  |       t        |      }	t        |      |	|<   | j                  | j                  |	      t         j                  |            }
| j                  ||
<   ||   j                  |       ||   j                            |D ]  }|j                  }||   j                         }t        |      }	t               |	|<   | j                  | j                  |	      t         j                  |            }
||   j                  |       | j                  ||
<    t        |      D ]^  \  }}t!        |t"              st        |      }	t               |	|<   | j                  | j                  |	      |      }
| j$                  ||
<   ` t        |      D ]  \  }}t!        |t              st        | j                        D ]  }t        |      }	t        |      |	|<   ||   j                  |       | j                  | j                  |	      t         j                  |            }
| j&                  ||
<   ||   j                            t        |      D ]  \  }}t!        |t"              st        | j                        D ]  }t        |      }	t        |      |	|<   ||   j                  |       | j                  | j                  |	      t         j                  |            }
| j(                  ||
<   ||   j                            t        |      D ]^  \  }}t!        |t              st        |      }	t#               |	|<   | j                  | j                  |	      |      }
| j&                  ||
<   ` |S rD   )r)   ro   re   ranger,   rz   r{   r&   r   rW   r^   rk   r   r   r   	enumeraterV   r   r   r   r   )rG   r<   r   all_next_staterm   rn   src_tensor_dimdst_tensor_dimmove_mesh_dimnew_placements
dist_statesrc_mesh_dim	placementr   s                 r   get_next_statez)DTensorRedistributePlanner.get_next_state?  s   ` KM9MM! 
 + 	;E"--N"'(=(="> ;!^3 !5^ D H H J$^4;;MJ!%j!105n0E}-!^^NN>2.BB,
 .2-A-Az*$^4;;MJ$^488:%;	;4 + 
	>E"--N0@DDFM!*-N,5KN=)~.*>>?STJ !077F)-)=)=N:&
	> (1'< 	>#L)i1!*-N+4;N<(~.0EJ *.)=)=N:&	> $-Z#8 	;Hii3"'(=(="> ;!%j!1+0+@x($^4;;HE!^^NN>2.BB,
 .2__z*$^488:;	;& $-Z#8 	;Hii1"'(=(="> ;!%j!1+0+@x($^4;;HE!^^NN>2.BB,
 .2-@-@z*$^488:;	;& $-Z#8 	9Hii3!*-N'.yN8$~.0EJ *.N:&	9 r(   	src_state	dst_statec                    ddl }d}d|||gfg}t               }|r|j                  |      \  }}}	}
|	|k(  r|
S |	|v r$|j                  |	       | j	                  |	j
                  |	j                        }|j                         D ]0  \  }}||vs||z   }|
|gz   }|dz  }|j                  |||||f       2 |rt        d| d|       )aB  
        Find the min cost path from src_state to dst_state using Dijkstra's
        algorithm.

        Args:
            src_state: The source state
            dst_state: The destination state

        Returns:
            A list of states representing the min cost path from src_state to
            dst_state
        r   N   zNo path found from src_state z to dst_state )
heapqsetheappopaddr   r<   r=   rj   heappushAssertionError)rG   r   r   r   counterpqvisitedcost_current_statepathnext_states
next_statetransition_costnew_costnew_paths                   r   find_min_cost_pathz-DTensorRedistributePlanner.find_min_cost_path  s    	  )i[12 	 %+0==+<(D!]D	)'KK&--((-*N*NK 0;/@/@/B R+
OW,#o5H#zl2HqLGNN2':x'PQR " +I;nYKP
 	
r(   r   full_tensor_shapec                 V   t        |      }| j                  J |j                  D ]  }|j                  }|j                  }t        |      dkD  sJ |D ]Q  }||k(  r	t        j                  ||   | j                  j                  |      | j                  |         d   }	|	||<   S  |S )Nr   r   )
r&   r   r=   re   rf   ru   r   local_shard_size_and_offsetr+   size)
rG   r   r   r   new_logical_shapern   re   rf   mdimnew_sizes
             r   get_logical_shapez,DTensorRedistributePlanner.get_logical_shape  s     !!23***55 	9E))JIy>A%%%! 98# <<%j1$$))4)8OOD) 	
 19!*-9		9 ! r(   src_specdst_specc           	         t        d |j                  D              r.t        j                  |j                  |j                        \  }}n|j                  }|j
                  }t        d |j                  D              r.t        j                  |j                  |j                        \  }}n|j                  }|j
                  }||t        d      | j                  ||      }| j                  ||      }	g }
| j                  ||	      }t        j                  |      D ]  \  }}|j                  |j                  k7  s d}t        t        |j                  |j                              D ]R  \  }\  }}||k7  s|dk7  rt        d      |}| j                  |||      }|
j                  t!        |||f|             T  |
S )Nc              3   <   K   | ]  }t        |t                y wrD   rV   r   r_   r   s     r   ra   zRDTensorRedistributePlanner.generate_graph_based_transform_infos.<locals>.<genexpr>3        
5>Jy-0
   c              3   <   K   | ]  }t        |t                y wrD   r   r   s     r   ra   zRDTensorRedistributePlanner.generate_graph_based_transform_infos.<locals>.<genexpr>>  r   r   zRedistribution of _StridedShard placement is only supported for _StridedShard that can be converted to ordered Shard placements. Full _StridedShard redistribution support is not yet implemented.z@Multiple mesh_dims are different between cur_state and nxt_stater   r   r   )anyr<   r
   &_normalize_placements_into_shard_orderrp   shard_orderNotImplementedErrorrW   r   	itertoolspairwiser   zipr   r   r{   r   )rG   r   r   r   src_placementsrs   dst_placementsdst_shard_orderr   r   rq   
state_pathr   	nxt_stateupdate_mesh_dimr   r~   nxt_placementr   s                      r   $generate_graph_based_transform_infosz?DTensorRedistributePlanner.generate_graph_based_transform_infos+  s     
BJBUBU
 
 BB'' ,NO &00N&22O 
BJBUBU
 
 BB'' ,NO &00N&22O"o&=%T 
 NN>?C	NN>?C	02,,Y	B
$-$6$6z$B 	 Iy##y';';;"$@I	,,i.B.BCA <H<}m %5*b0"0 b#  +3(,(>(>%x1B) (..*)84A=3Q.;		0 r(   c           	         | j                   J t        |j                        }|g}g }| j                  j                  dk(  r;|j                  t        d|j                  d   |j                  d   f|             |S t        |j                        D ]  \  }}||   }t        |t              r|| j                  j                  dz
  k  s8| j                  j                  |      }	|j                  ||j                     |	| j                   |         \  }
}t        |      }|
||j                  <   |j                  |       |j                  |        t        |j                        }t        |j                        }|j                  dkD  rt        t!        t#        |                  D ]  }||   }||   }t        |t              r|j                  }g g }}t        t%        ||            D ]T  \  }\  }}||k\  r nG|j'                  |      r|j                  |       |j'                  |      sD|j                  |       V ||k7  r
t)               }||k7  s|j                  t        |||f||                |||<    t        t%        ||            D ]5  \  }\  }}||k7  s|j                  t        |||f||                |||<   7 |S )a  
        Generate the transform infos from the source placements to the target placements.

        To transform from source to target placement it might have multiple steps, i.e. it
        might decompose Si -> Sj into Si -> R -> Sj.
        This would detect if there're mis-aligned/nested shardings between src/dst placements.
        E.g. Suppose the redistribution to perform is (Shard(0), Shard(0)) -> (Replicate(), Shard(0)),
        in this case Shard(0) -> Shard(0) for mesh dimension 1 actually needs resharding, because in
        the former is a nested-sharding of a tensor already already sharded dimension 0, whereas
        the latter is the first sharding on tensor dimension 0.
        r   r   r   r   )r   r&   shaper+   rv   r{   r   r<   r   rV   r   r   _local_shard_size_and_offsetry   
num_shardsreversedr   ru   r   rx   r   )rG   r   r   initial_logical_shapemesh_dims_to_logical_shaperq   isrccurrent_logical_shapemesh_dim_sizelocal_shard_sizer   r   current_placementstarget_placementsr   currenttarget	shard_dimcurrent_mesh_shardingtarget_mesh_shardingr   ps                          r   generate_greedy_transform_infosz:DTensorRedistributePlanner.generate_greedy_transform_infosm  s   $ *** $X^^ 4&;%<"02  A% ""(0(;(;A(>@S@STU@V'W"7 #"
   3 34 	IFAs$>q$A!#u%t'',,q00$($4$4$9$91$9$EM*-*J*J-cgg6%*+'$a
 )--B(C%1A%cgg..556GH*112GH	I( "("5"56 !4!45" %U3/A+B%CD #:,X6*84 fe, !'

IBDb+?)%..0AB& ;	6Aq =!::i0188;::i0077:; -0DD
 "+f$#**&%-07/@*DX*N 4:&x0G#:P ,5"$56,
 	6'H'w & &&"!),3V+<&@&J 06"8,	6 r(   rD   )   r      r   r   )r    r!   r"   __doc__rY   	dataclassrW   r^   staticmethoddictr#   r&   r   rk   ro   r   r   r   r%   r   r}   r   r   r   r   r   r   r
   r   r   r'   r(   r   r)   r)      s\    [$d3. . 4.` 
tCcN3 

 
 
 $z $d3S	>.B $ $ 
 .2	6767!.167 Y^,67 $d*	67
 
67 67p%% % 
	%(  !  #$%% % 	%
 !% % 
%$_)S.)_  *_ 
4c9	:	_F0
"0
/80
	4	50
d!9! ! !c?	!
 
c!.@@ @ !c?	@
 
n	@Dvv v 
n		vr(   r   r   use_graph_based_transformc                 X   | j                   }| j                  }|j                  }||J t        d ||fD               }|du rd}nt        t        }n|d}t	        |t        | j                              }|r|j                  | || j                        }|S |j                  | |      }|S )Nc              3   F   K   | ]  }t        j                  |        y wrD   )r
   is_default_device_order)r_   orders     r   ra   z2_gen_transform_infos_non_cached.<locals>.<genexpr>  s#      $ 	++E2$s   !TF)	r+   r   allr   r3   ru   r   r   r   )	r   r   r   r+   rs   r   has_non_default_orderdrprq   s	            r   _gen_transform_infos_non_cachedr     s    
 &&K**O**O &?+FFF
 !$ $%7$ ! 
 $$(!	,	8$G!	"	*$)!
";HNN0C
DC BBh

  ==hQr(   c                     t        | ||      S rD   )r   )r   r   r   s      r   _gen_transform_infosr  
  s     +(5 r(   F)async_opis_backwardr   local_tensorcurrent_spectarget_specr  r  c                   |j                   |j                   k7  rt        d      | }|j                   }|j                         }|| S t               rt	        |||      }	nt        |||      }	t               }
|
R|
j                  | |j                  |j                  t        j                  ||	|j                  |j                              nt        j                         }|5  |	D ]  }|j                  }|j                  \  }}|j!                  |      }||k(  r| }9|dk(  r| }A|j#                         r|j%                         r%t'        t(        |      }|j+                  | ||      }n|j-                         r0t'        t.        |      }|j1                  | |||j2                        }nt5        d| d| d      |j-                         rt'        t.        |      }|j%                         r&t'        t(        |      }|j7                  | |||      }n3|j#                         r|j9                  | ||||         }n
|j-                         s
J d|        t'        t.        |      }|j:                  |j:                  k7  r|j=                  | |||j2                  |j:                        }n|j%                         r|j#                         r(t'        t(        |      }|s|j?                  | ||      n| }nU|j-                         rC|st5        d| d| d      t'        t.        |      }|j1                  | |||j2                        }n| }|s*tA        |tB        jD                        r|jG                         }|}  	 ddd       |S # 1 sw Y   |S xY w)	z
    This redistribute the local tensor (torch.Tensor) from the current DTensorSpec to
    the target DTensorSpec, which involves the necessary collective calls to transform
    the local shard of the DTensor from its current spec to the target spec.
    z)Cross device mesh comm not supported yet!Nr   r   zredistribute from z to z not supported yetz,Current placement should be shard but found )$rp   r   r   r	   r   r  r   record_redistribute_callsr<   r)   r   r   
contextlibnullcontextr   r   r   is_replicate
is_partialr   r   _reduce_valuerx   r   _to_replicate_tensorr   RuntimeError_reduce_shard_value_replicate_to_shardry   _to_new_shard_dim_partition_valuerV   funcolAsyncCollectiveTensorwait)r  r  r  r  r  r   new_local_tensorr+   my_coordinaterq   
debug_moderedistribute_contextr   r   r   r   
num_chunkspartial_speccurrent_placementtarget_placement
shard_specs                        r   redistribute_local_tensorr!    s    K,,,!"MNN###K..0M 9+'@
 /+'@
 '(J ! 	,,##""&@@''((		
	
 ##%   
 ],- \	,N''A,??OGV$))1)5J& #/ Q $0 ""$%%'#'#9L'3'A'A$k1($ %%'(,UG(<%'8'M'M$k1n6R6R($ ',WIT&AST  "#'v#6 %%'#'#9L'3'G'G$k16F($ ))+'7'K'K$k1mA6F($ #++- FwiP- "&eW!5J!~~)9)=)==+5+G+G('*88,00,( ""$'')#'#8L  + %55lKQRS) %
 %%'&*0	fXEWX  )-UG(<%'8'M'M$k1n6R6R($
 (4$
 &">">! $4#8#8#: +Ly\	,],| }],| s    JN  N
c                       e Zd Ze	 	 	 ddddedeedf   dedej                  dz  d	ej                  dz  fd
       Z
edd       Zy)RedistributeNinputdtensor.DTensorr+   r<   .r  forward_dtypebackward_dtypec           	      H   || _         || _        |j                  j                  | _        ||||j                  j                  k7  rc|j                  j                  |      }t        ||j                  j                  t        |j                  |j                         |            }n|j                  }|j                  }|| _        |j                  |k7  r(t        |||j                        }	t        |||	|      }
n|}
|}	t        j                   |
|	|j"                        S )Ndtyper   strider*  rp   r<   tensor_metar.  )r  requires_grad)r  r'  _local_tensorr*  original_dtypetor
   _specr<   r   r   r,  r  r.  r!  dtensorDTensorr1  )ctxr$  r+   r<   r  r&  r'  r  r  r  outputs              r   forwardzRedistribute.forward  s     +"0066$%:M:M:S:S)S ..111FL&  ;;11&++ <<>'L !..L ;;L'""j0%Z\5M5MK /lK(F
 "F&K --
 	
r(   c           	      $   | j                   }| j                  }| j                  xs | j                  }||j                  j
                  k7  r|j                  j                  |      }t        |j                  j                  |j                  j                  t        |j                  |j                         |            }t        |j                  |j                  |j                        }n|j                  }|j                  }t        ||||d      }|j
                  | j                  k7  r|j                  | j                        }g }|j                  D ]=  }	|	j!                         r|j#                  t%                      -|j#                  |	       ? t        |j                  t'        |      t        |j                  |j                         |j
                              }
t)        j*                  ||
|j,                        }|d d d d d fS )Nr)  r+  r-  T)r  r  r/  r0  )r  r  r'  r3  r2  r*  r4  r
   r5  r+   r<   r   r   r,  r.  r!  r  r{   r   r%   r6  r7  r1  )r8  grad_outputprevious_specr  r'  r  r  r9  normalized_placementsprevious_placementspecoutput_dtensors               r   backwardzRedistribute.backward  s   ((<<++As/A/A[66<<<&4477n7ML& &&22&,,77&%++&--/(L ("..(33(44M '44L&,,L*
 <<3---YYs112F 24"/":": 	A!,,.%,,Y[9%,,-?@	A %%'("!''"))+ll
 !%33
 
 	
r(   )FNN)r<  r%  )r    r!   r"   r   r   r%   r   r[   torchr*  r:  rB  r'   r(   r   r#  r#    s     ,0-13
 !3
  	3

 )S.)3
 3
 {{T)3
 d*3
 3
j F
 F
r(   r#  )T)r-   NrD   )Ar
  rY   r   loggingr/   collectionsr   collections.abcr   	functoolsr   typingr   r   r   rC  )torch.distributed._functional_collectivesdistributed_functional_collectivesr  torch.distributed.tensor._apitensor_apir6  r	   &torch.distributed.tensor._dtensor_specr
   r   r   r   $torch.distributed.tensor.device_meshr   (torch.distributed.tensor.placement_typesr   r   r   r   r   torch.utils._debug_moder   	getLoggerr    loggerr   r[   r$   contextmanagerr   r   r*   r   r%   ReferenceTyper#   r3   r6   r)   r&   r   r  Tensorr!  autogradFunctionr#  r'   r(   r   <module>rZ     s        # $  - -  : : / / E  <  : 
		8	$ 7; #Xd^ : 88d 88 88vZ   	'


$%'CC 
%%/2%!%.
S	 S	r .2!!!  $d{! 
.	!H  .2  $d{ 
.	  -1U,,UU U
 U U  $d{U \\Up~
5>>** ~
r(   