
    sgT                        d dl Z d dlZd dlZd dlZd dlZd dlmZmZmZm	Z	m
Z
mZmZ d dlZd dlmZ d dlmc mc mZ d dlmc mc mZ d dlmZmZ d dlmZmZmZmZm Z  d dlm!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z'm(Z( d d	l)m*Z* d d
l+m,Z,m-Z-m.Z. erd dl/m0Z0 	 d dl1m2Z3 ejl                  jn                  Z7 ejp                  e9      Z:dejv                  jx                  dee=df   dee>e=f   de=fdZ?dejv                  jx                  dee=df   dee>e=f   de@fdZAdejv                  jx                  dee=df   dee>e=f   ddfdZB G d d      ZCy# e4$ r	 d dl1m5Z3 Y w xY w)    N)castDictListOptionalSequenceTupleTYPE_CHECKING)DTensorSpec
TensorMeta)_is_inplace_op_is_out_variant_opOpInfoOpSchemaOutputSpecType)is_rng_supported_mesh)redistribute_local_tensor)ShardingPropagator)convolution_backward_handlerconvolution_handler)try_find_mesh_from_args)Partial	Placement	Replicate)
DeviceMesh)_cxx_pytree)_pytreeop_callargs.kwargsreturnc                 P     | j                   |i |}|t        ur|S t        d      )z
    Decomposes a op to core ATen op, this handler is mostly here
    for inference mode usage where the ops are not core aten ops.
    zDecomposition failed)	decomposeNotImplementedRuntimeError)r   r   r   rs       U/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/tensor/_dispatch.pydecompose_handlerr'   ,   s4     	4*6*A122    c                     t        t        j                  |d         }t        t        j                  |d         }|j                  |j                  k(  S )Nr      )r   torchTensorshape)r   r   r   lhsrhss        r&   is_same_size_handlerr0   <   s?    
 u||T!W
%C
u||T!W
%C99		!!r(   c           	      t   t         j                  j                  j                  | ||      }t	        j
                  t        t        t           |j                        |j                        }t        t        t        df   |      } | |i |j                  }t        t        t         j                     |d         d   }|j                  }|j                  }g }	|D ]>  }
t!        |
t"              r|	j%                  |
       %|	j%                  t'        d             @ t        t(        j*                  |d         }t-        |t/        |	      t1        |j3                         |j5                         |j6                              }t        j                  ||d      }|j9                         }|j;                  |       y )	N.r   maxr*   r-   stridedtype)mesh
placementstensor_metaF)local_tensorspecrequires_grad)dtensorDTensor_op_dispatcherunwrap_to_op_infopytreetree_unflattenr   r   object
local_argsargs_tree_specr   local_kwargslistr7   device_mesh
isinstancer   appendr   r+   r,   r
   tupler   sizer4   r5   full_tensorcopy_)r   r   r   op_infolocal_tensor_argslocal_resultsgrad_dtensorgrad_placementsr6   found_inf_placements	placementtarget_tensorr:   found_inf_dtensor	found_infs                  r&   found_inf_reduce_handlerrX   F   s|   
 oo,,>>wfUG--T&\7--.0F0F U63;/1BC.G'2F2FGMW__-tAw7:L"--O##D,.$ 8	i+ ''	2 ''7	8 tAw/M-.$$& '')%%
D  "U "--/I	"r(   c                      e Zd ZdZddZdej                  j                  dee	df   de
ee	f   de	fd	Zed
ededdfd       Zdej                  j                  dee	df   de
ee	f   defdZede	dede	fd       Zdej                  j                  dej(                  dddefdZdej                  j                  dddddefdZy)OpDispatchera  
    Op dispatching class instance to handle args/kwargs pre-processing (un-wrapping), sharding
    propagation, redistribute local args, local compute, and post-processing (re-wrapping). It
    also handles any op specific logic if necessary.

    NOTE: Given the runtime overhead of Tensor subclass (__torch_dispatch__), the OpDispatcher
    is designed to minimize the CPU overhead by using the tricks of proper unflattening, faster
    pytree if needed, and leveraging various caching mechanisms implemented in the sharding
    propagation and redistribute modules. The CPU overhead is critical to eager mode performance,
    one need to carefully measure the CPU overhead when making significant changes to the
    OpDispatcher and ShardingPropagator.
    r    Nc           
      l   t               | _        t        j                  j                  t        j
                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  t        j                  j                  h
| _        t        j                   j                  t"        t        j$                  j                  t&        t        j(                  j                  t*        t        j,                  j                  t.        t        j0                  j                  t2        i| _        d| _        y )NF)r   sharding_propagatoratennative_dropoutdefaultnormal_	rand_like
randn_likerandint_like	low_dtypelow_dtype_outuniform_	bernoulli
bernoulli_float_random_opslinearr'   is_same_sizer0   convolutionr   convolution_backwardr   *_amp_foreach_non_finite_check_and_unscale_rX   _custom_op_handlers_allow_implicit_replication)selfs    r&   __init__zOpDispatcher.__init__|   s   #5#7 ''LL  NN""OO##%%''++MM!!NN""OO!!
 KK!2%%';$$&9%%--/K;;CCE]$
  ,1(r(   r   r   .r   c                 d
   || j                   v r | j                   |   |||      S | j                  |||      }t        j                  d|j                         | j
                  j                  |       |j                  }t        j                  d||       |J d       |j                  }|j                         |j                  r*|j                  J | j                  ||j                         |j                  r?t        j                  t!        t"        t$           |j&                        |j                        n|j&                  }t!        t(        t$        df   |      }|| j*                  v rt,        j.                  s3t1        |      r(t-        j2                  |j4                        t,        _        t!        t6        j8                  |d         t!        t:        j<                  |d         }	}t,        j.                  r5|	j>                  s)t,        j.                  jA                  |jB                        ntE        jF                         }
|
5   ||i |jH                  }ddd       n ||i |jH                  }n|jJ                  }|j                  jL                  jN                  jP                  }|d}ndtR        dt:        j<                  fd	}tU        |tR              r	 ||      }nftU        |tV              rV|D cg c]  }| ||      nd }}tU        |t"              sJ d|v r'tY        |d   jZ                        }t]        d
| d      |jJ                  |t^        j`                  jb                  k(  r{te        tg        jh                               D cg c]  }d }}tg        jj                  |       tm        to        d |            }tq        jr                  tt        jv                  |d      }ty        |      r|jJ                  |d   S yt{        |      rtU        |jJ                  t|              s|jJ                  fn|jJ                  }g }d}|jN                  j~                  D ]d  }|j                  st!        t6        j8                  ||j                           }t!        tR        ||         |_!        |j                  |       |dz  }f t        |      dk\  sJ d       t        |      dkD  rt}        |      S |d   S | j                  |jJ                        S # 1 sw Y   xY wc c}w c c}w )z(
        Main dispatching logic
        zDispatching op_call: %szoutput_sharding for %s: %sNz"output sharding should not be None.r   r:   r    c                    | j                   h| j                   j                  }| j                   j                  }t        |      dk(  rt	        j
                  d|      S t	        j                  g |      S t        |  d      )Nr    )r5   z has no tensor metadata.)r8   r-   r5   lenr+   zerostensorr$   )r:   r-   r5   s      r&   default_tensorz-OpDispatcher.dispatch.<locals>.default_tensor   sr    ''3 $ 0 0 6 6 $ 0 0 6 6u:?#(;;r#?? $)<<%#@@*dV3K+LMMr(   zreturn type z in DTensor op is not supportedc                 
    | d uS Nrv   )xs    r&   <lambda>z'OpDispatcher.dispatch.<locals>.<lambda>  s
    $ r(   Tr*   z,out variant should have at least one out arg)Erp   r?   loggerdebugschemar\   	propagateoutput_shardingr6   get_coordinateneeds_redistributeredistribute_schemaredistribute_local_argsrD   r@   rA   r   r   rB   rC   r   rj   random_rng_trackerr   OffsetBasedRNGTrackerdevice_typer<   r=   r+   r,   is_meta_distribute_region_spec
contextlibnullcontextrE   output_specop_schemareturnsr
   rH   r   strtypeNotImplementedErrorr]   equalr_   rangedistget_world_sizeall_gather_objectrF   filter	functoolsreduceoperatorand_r   r   rJ   	argumentsis_outnamerI   rw   wrap)rr   r   r   r   rN   r   r6   rO   	first_argfirst_local_argrng_contextrP   r:   ret_listrz   sret_type_obj_listoutput_specsout_dtsspec_idxargumentout_dts                           r&   dispatchzOpDispatcher.dispatch   s    d...44++G4WdFKK (($?.?  **73!1117OL*P,PP*|| ,11 '::FFF,,_@@ )) %%fw'9'9:G<R<R ''  !%U63;%79J K$*****/DT/J +1*F*FtGWGW*XF'-1'//47-KTLL"3A"6N?	
 **?3J3J ''::9??K#//1  ! X$+->$W'BVBV$WMX X !(): Sg>R>R S #..D~~((0088H| !%N N N dK0$24$8Mh/ OS%IJQ]q)D%M % &mT:::},#&x{'7'7#81*8*4ST  &&.$**,,, +00C0C0E*FGQDGG&&x?'> IJ ) 0 0$ O'"**6Aw( "/"="=uE !,,.$00 
 GH#OO55 "??!'//6(--3HIF#'\(5K#LFLNN6*MH" w<1$T&TT$%(\A%55>E71:E99]O,G,GHHkX XN% Hs   	T:T(	T-T%rN   suggested_input_schemac                    | j                   )t        t        j                  |j                              }n|j                  }g }t        | j                        D ]  \  }}||   }t        |t              r]t        t        j                  | j                  |         }||k7  rt        |||      }|j                  |       f|j                  |       x|j                  |        t        |      | _        y r|   )rD   rJ   r@   tree_leavesargs_schema	enumerateflat_args_schemarH   r
   r   r+   r,   rC   r   rI   )	rN   r   flatten_args_schema_to_reshardnew_local_argsiarg_specreshard_arg_specr9   resharded_local_tensors	            r&   r   z$OpDispatcher.redistribute_local_args*  s     !!--2""#9#E#EF.* .D-O-O*')$W%=%=> 	8KAx=a@(K0#ELL'2D2DQ2GH//-F$h0@.* #))*@A")),7%%&67	8 #>2r(   c           
      N   | j                   j                  j                  |d       }|'|j                  rt	        j
                  |      \  }}|}n|d }}g }i }	g }
i }d }|D ]  }t        |t        j                        ry|
j                  |j                         |4||j                  k7  r%| j                  |||      }|j                  |       o|j                  }|j                  |j                         t        |t        j                        rD|xs t!        ||      }|j                  | j#                  |||             |
j                  |       |j                  |       |
j                  |        |j%                         D ]  \  }}t        |t        j                        rU|j                  ||<   |(||j                  k7  r| j                  |||      }||	|<   Y|j                  }|j                  |	|<   ut        |t        j                        r,|xs t!        ||      }| j#                  |||      |	|<   |||<   ||	|<   |||<    |J d| d       t'        |t)        ||rt	        j*                  ||      n
t-        |      |	|      |t-        |
      ||      }|S )Nz*found no DeviceMesh from dtensor args for !)schema_info)r\   op_to_schema_infogetneeds_pytreer@   tree_flattenrH   r<   r=   rI   _local_tensorrG   *_try_replicate_dtensor_spec_in_missing_dimr   r+   r,   r   %_try_replicate_spec_for_scalar_tensoritemsr   r   rA   rJ   )rr   r   r   r   runtime_schema_info	tree_args	args_spec	args_listr   kwargs_schemarC   rE   r6   argr:   kvrN   s                     r&   r?   zOpDispatcher.unwrap_to_op_infoH  s    #66HHLLT
 */B/O/O#)#6#6t#< Iy*3I#'yI$&+-#%
*,%) 	'C#w/!!#"3"34#(?  JJdD  &&t,??D&&syy1C.J6w	J"">>wTR !!#&""3'!!#&1	'4 LLN 	$DAq!W__-"#//Q#(=JJDD (,M!$==D'(wwM!$Au||,J6w	J#'#M#MQ$a  #$Q#$a "#Q'	$* X#MgYVW!XX %%k9=;'/ *
 r(   resr:   c                 
   t        | t        j                        rW|=t        |t              sJ d| d       t	        j
                  | || j                        S | j                  dk(  sJ d       | S t        | t        t        f      r{|t        |t        t        f      sJ d| d       g }t        | |      D ]*  \  }}|j                  t        j                  ||             , t        | t              rt        |      S |S | S )NzBoutput spec does not match with output! Expected DTensorSpec, got .)r;   r   zoutput tensor should be scalar!zAoutput spec does not match with output! Expected list/tuple, got )rH   r+   r,   r
   r<   r=   r;   ndimrF   rJ   ziprI   rZ   r   )r   r:   res_lister   s        r&   r   zOpDispatcher.wrap  s   c5<<(!+ `WX\W]]^_`  sD@Q@QRR xx1}G&GG}
dE]+#
tUm) [RSWRXXYZ[  HC 91 1 1!Q 789 '1e&<5?J(J Jr(   
tensor_argr6   r   c           	      r   |j                         dk(  r$|j                  dk(  rt        j                  d       |j                         dk(  s| j                  rTt        |t               f|j                  z  t        |j                  |j                         |j                              }|S t        | d      )Nr*   zFound a non-scalar tensor with numel=1 and ndim!=0, we are implicitly creating a replicated DTensor for it. However, please consider changing it to a scalar tensor or explicitly create a DTensor under distributed enviroment.r3   r8   zw: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators!)numelr   warningswarnrq   r
   r   r   r-   r4   r5   r$   )rr   r   r   r6   replication_specs        r&   r   z2OpDispatcher._try_replicate_spec_for_scalar_tensor  s     "z!';MMO "d&F&F**&$**%,,.$**   	 ) Q Q r(   dtensor_argzdtensor.DTensorc           	         ddl m} |j                  }|j                  |      }| j                  rd|j
                  v r||k(  rt        |j                        D cg c]  }t                }}|j                  |      }	|j                  d   ||	<   t        |t        |      t        |j                  |j                         |j                               }
|
S t#        | d| d|       c c}w )Nr   )_mesh_resourcesforeachr3   r   zA: DTensor does not support cross-mesh operation yet! Got meshes:  )torch.distributed.device_meshr   rG   get_root_meshrq   __name__r   r   r   get_root_mesh_dimr7   r
   rJ   r   r-   r4   r5   r   )rr   r   r   r6   r   cur_mesh	root_meshr   r7   cur_mesh_root_idxreplicate_specs              r&   r   z7OpDispatcher._try_replicate_dtensor_spec_in_missing_dim  s     	B**#11(;	,,W---T!/4Y^^/DE!)+EJE / A A( K,7,B,B1,EJ()(j!&%++&--/%++N 	 &) #fAhZ1  Fs   C*)r    N)r   
__module____qualname____doc__rs   r+   _ops
OpOverloadr   rB   r   r   r   staticmethodr   r   r   r?   r   r   r,   r
   r   r   rv   r(   r&   rZ   rZ   n   sq   18PI&&PI FCK PI S&[!	PI
 
PId 33 (3 
3 3:W&&W FCK W S&[!	W
 
Wr &  6  2 &&  LL  	 
 
 B"&&" '" 	"
 
"r(   rZ   )Dr   r   loggingr   r   typingr   r   r   r   r   r   r	   r+   torch.distributeddistributedr   torch.distributed.tensor._apiry   _apir<    torch.distributed.tensor._random_randomr   &torch.distributed.tensor._dtensor_specr
   r   #torch.distributed.tensor._op_schemar   r   r   r   r   r   &torch.distributed.tensor._redistributer   'torch.distributed.tensor._sharding_propr   !torch.distributed.tensor._tp_convr   r   torch.distributed.tensor._utilsr   (torch.distributed.tensor.placement_typesr   r   r   r   r   torch.utilsr   r@   ImportErrorr   opsr]   	getLoggerr   r   r   r   rB   r   r'   boolr0   rX   rZ   rv   r(   r&   <module>r     s{        M M M    / / 1 1 J  C L F D R R 8.1 yy~~			8	$3ZZ""3

3 f3 	3 "ZZ"""

" f" 
	"%#ZZ""%#

%# f%# 
	%#PP PS  .-.s   E	 	EE