
    sg@E                        d dl Z d dlmZmZmZmZ d dlZd dlmZ	 d dl
mc mZ d dlmc mZ d dlmZ d dlmZ d dlmZmZmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z" ejF                  jH                  Z$d
gZ%e jL                  d        Z'dee"df   de(de(fdZ)dee"df   dedefdZ*dejV                  jX                  dee-df   dee.e-f   defdZ/d Z0dejV                  jX                  dee-df   dee.e-f   de-fdZ1dejV                  jX                  dee-df   dee.e-f   de-fdZ2dededee   dee   de(de(d ejf                  d!e(ded"e(deeef   fd#Z4dejV                  jX                  dee-df   dee.e-f   de-fd$Z5d%edededee   de(de(d&ed ejf                  d!e(ded"e(defd'Z6dejV                  jX                  dee-df   dee.e-f   de-fd(Z7e$j`                  jp                  e1e$jr                  jp                  e2e$jt                  jp                  e5e$jv                  jp                  e5e$jx                  jp                  e7e$jz                  jp                  e7iZ>d) Z?d* Z@y)+    N)castDictOptionalTuple)Tensor)
DeviceMesh)DTensor	ReplicateShard)DTensorSpec
TensorMeta)_MaskPartial)	_skip_dim	Reductionreplicate_reduction_dims)	Placementloss_parallelc               #   <   K   t                d t                yw)a  
    A context manager that enables loss parallelism, where efficient parallelized loss computation
    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
    loss is supported.

    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

    Args:
        input (:class:`DTensor`):
            Input logits. Assumed to be sharded on the class dimension.
        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
            Must be ground truth class indices (class probabilities currently not supported).
            Assumed to be replicated across the ``DeviceMesh``.
        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
            If given, assumed to be replicated across the ``DeviceMesh``.
        label_smoothing:
            Currently not supported.

    Returns:
        A replicated :class:`DTensor`.

    Example:
        A sharded DTensor is manually created here to showcase the usage.
        In practice, it is usually the output of a TP module.

        >>> # xdoctest: +SKIP("distributed")
        >>> from torch.distributed.tensor.parallel import loss_parallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> device_mesh = init_device_mesh("cuda", (8,))
        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
        >>> target = torch.randint(16, (4,), device="cuda")
        >>> with loss_parallel():
        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
        >>>     loss.backward()
        >>> ...
    N)_enable_custom_loss_ops_disable_custom_loss_ops     Y/var/www/html/venv/lib/python3.12/site-packages/torch/distributed/tensor/parallel/loss.pyr   r      s     T 	s   
placements.dimreturnc                 |    t        |       dk(  st        d      | d   j                  |      st        d| d      y)N   zLCurrently loss_parallel() only supports input on one-dimensional DeviceMesh.r   zUloss_parallel() should be enabled only when the input tensor is sharded on dimension .)len
ValueErroris_shard)r   r   s     r   _find_all_reduce_mesh_dimr#   P   sS    z?aZ
 	
 a=!!#&cdgchhij
 	
 r   meshc                    t        | t              r-| j                  |k(  r| S t        d| d| j                   d      t        | t        j
                        rt        j                  | ||d      S t        dt        |              )Nz	Expected z	 but got r   F)device_meshr   	run_checkzUnsupported type )	
isinstancer	   r   RuntimeErrortorchr   
from_local	TypeErrortype)tensorr   r$   s      r   _cast_to_dtensorr/   \   s     &'"
*M:,i@Q@Q?RRSTUU	FELL	)!!u
 	
 +DL>:;;r   op_callargskwargsc                 (   t         j                  j                  | ||      }t         j                  j                  j	                  |j
                        }t        |t              r|S t        |t              r|d   S t        dt        |       d      )Nr   zUnexpected tensor meta type: r   )r	   _op_dispatcherunwrap_to_op_infosharding_propagator_propagate_tensor_metaschemar(   r   tupler)   r-   )r0   r1   r2   op_infotensor_metas        r   r7   r7   l   s    
 $$66wfMG((<<SSK +z*	K	'1~:4;L:MQOPPr   c                    | j                         } |r| j                  t        j                  k(  sJ t	        j
                  | t        j                  j                        \  }}| j                  |      } | j                         dk(  r| }nYt        j                  | |d      }t        j                  |t        j                  j                  j                   ||f      }| |z
  }t        j"                  t        j$                  |      |d      }	t        j                  |	t        j                  j&                  j                   ||f      }	t        j(                  |	      }
||
z
  }|s|j                  |      }|S )N)type_promotion_kindr   T)keepdim)reduceOpgroup)
contiguousdtyper*   halfutilselementwise_dtypesELEMENTWISE_TYPE_PROMOTION_KINDDEFAULTtonumelamaxfuncol
all_reducec10dReduceOpMAXnamesumexpSUMlog)xr   half_to_floatr$   mesh_dimcomputation_dtyperesult_dtypeshiftedx_maxshifted_sumexpshifted_logsumexpresults               r   _log_softmaxr_      s2   	Aww%**$$$&+&>&>	uDDLL'#| 	
AwwyA~

1c40!!DMM--224:J
 e)YYuyy13EN&&!2!2!7!7h?ON 		.1((F<(Mr   c                    t        t        |d         }t        t        |d         }t        t        |d         }|j                  }t        |j                  |      }t        | ||      }t        |j                  |||j                  |      }	t        |j                  |j                  |      }
t        |	|
|	j                        S )Nr   r      r;   requires_grad)r   r	   intbool_specr#   r   r7   r_   _local_tensorr$   r   rd   )r0   r1   r2   rU   r   rV   specrW   output_tensor_metaresres_specs              r   _log_softmax_handlerrm      s    
 	Wd1gA
sDG
CtAw'M77D(#>H/vF
q]DIIx
PC		&H '' r   c                     t        t        |d         }t        t        j                  |d         }|j	                  |      S )Nr      )r   r	   r*   rB   rH   )r0   r1   r2   grad_outputinput_dtypes        r   _log_softmax_backward_handlerrr      s7    
 wQ(Ku{{DG,K>>+&&r   rU   targetweightlocal_weight	reductionignore_indexinput_shapechannel_dimrW   c
                 P   | j                         ddk  rddt        dt        ffd}
| |
|      }|J  |
|      }| |z  } t        j                  ||k7  |d      }|j	                        }t        |      }|j                  |||	      }t        j                  | |      }|j                  |||	      }|j                         }t        j                  ||k7  |d      }|t        j                  j                  k(  rdkD  r| j                  dd	      }||fS ||t        | j                        }d
|<   j!                  |      }t        j                  ||      j                        }t        j                  ||k7  |d      }|j#                         }n"||k7  j#                         j%                  |       }|t        j&                  j                  k(  r|j#                         }||fS |t        j(                  j                  k(  r|j#                         |z  }||fS )Nr   ra   r   rt   r   c                 l    dkD  r+dgz  }| j                   d   |<   | j                  |      }|S | }|S )Nr   r   )shapeview)rt   r|   wry   n_dimss      r   _weight_viewz'_nll_loss_forward.<locals>._weight_view   sQ    A:E "(aE+E"A  Ar   offset_shape
offset_dimr   g        )r   r   r*   where	unsqueezer   _partition_valuegather_reduce_valuesqueezer   NONEvaluenew_fulllistr|   expandrQ   rH   rS   MEAN)rU   rs   rt   ru   rv   rw   rx   ry   r$   rW   r   r~   local_wsafe_targetsafe_target_partial_placementsafe_target_partial_result_partialresult_reducedr^   total_weight	new_shapewsumr   s          `               @r   _nll_loss_forwardr      s    UUWFKz	V 	 	  '''|,K++f4fa@K((5L %++V,==dH \\![2FGN&44^T8TN$$[11F[[</;FINN(((VaZzz"c*|##M	!#	+HHY||A{L9AA+N{{6\14;xxz,.33588; IMM''' < 
inn**	*,<r   c                    t        t        |d         }|d   }|d   }t        t        |d         }t        t        |d         }|j                         dk\  rdnd}|j                  |   }	|j
                  }
t        |
j                  |      }t        t        |
j                  |g      |      }t               f|
j                  j                  z  }t        |||
j                        }d }|t        |||
j                        }t        |
j                  j                        D cg c]  }||k(  rt        d      n	t                }}|j!                  |
j                  |      j"                  }|j                  d   |j"                  j                  |   k(  sJ |t$        j&                  j(                  k(  r|}n|}t+        |      }||c|d<   |d<   t-        | t/        |      |      }t1        |j"                  |j"                  ||j"                  nd ||||j                  ||
j                  |
      \  }}t3        |
j                  ||      }t        |||j4                        |fS c c}w )Nr   r   ra   ro      rb   rc   )r   r	   re   r   r|   rg   r#   r   r   r   r
   r$   ndimr/   ranger   redistributerh   r   r   r   r   r7   r9   r   r   rd   )r0   r1   r2   rU   rs   rt   rv   rw   ry   channel_dim_sizeri   rW   target_placementsall_replicate_placementsru   isharded_placementsoutput_placementsrj   r^   r   out_specs                         r   _nll_loss_forward_handlerr     sM   
 	Wd1gA!WF!WFS$q'"IT!W%Luuw!|!Kww{+77D(+FH " ;-@+ !*~		>f&7CFL!&*BDIIN
 AFdiinn@U
;<XE!H9;6
 
 **4996HIWW!!!$(=(=k(JJJJINN(((-4 :DvDGT!W/tfM,	 & 2			FL 499&7EWXH 	 ..	

 	 =
s   "!I#rp   r   c                    |j                         dk  rdnd}|t        j                  j                  k(  r| |z  } |j	                  |      }t        j                  ||k7  |d      }t        j                  |      }t        ||      }|j                  |      j                         }|j                  ||	|
      }|j                  j                  J |j                  j                  j                  |j                        dz
  }t        j                   |j"                  d   |j$                        }|j                         dk(  r|||<   n|j                         dk(  r||||f<   ne|j'                  |d      }|j"                  }|j)                  d|j"                  |         }||||f<   |j+                  |      j'                  |d      }|j                         | j                         cxkD  rdkD  rn n| j	                  |      } |t-        |j                               D cg c]  }d }}|j"                  d   ||<   |j)                  |      }t/        |j"                        }d||<   |j1                  |      }t        j2                  |||      }| |z  } t        j                  ||k7  | d      } |t        j4                  |      z   | z  S c c}w )Nra   r   r   r   g      ?)devicer   )r   r   r   r   r   r*   r   
zeros_liker   r   flattenr   mask_bufferdatarH   rB   aranger|   r   	transposereshaper}   r   r   r   r   rR   )rp   rU   rs   rt   rv   rw   r   rx   ry   r$   rW   r   
grad_inputr   masked_safe_targetgrad_update	arange_1dgrad_input_tintermidate_shapegrad_input_2d_r   r~   w_targets                           r   "_nll_loss_and_log_softmax_backwardr   Z  s    uuw{!KINN(((!L0k*F++f4fa@K!!!$J %++V%%k2::<K*;;KxX((--999#//4477
8H8HICOK  #,>,E,EI
 	uuw!|)4
%&	
A4?
9001!++K<(..$,,R1EF7Bi!334"''(9:DD[RTU
~~+//+/a/!++K8 %aeeg/1Q/	/!'a	+	* M	!#	+MM)$<<;7!H,++f4k1EK 1%44# 0s   "	Kc                    t        t        |d         }t        t        |d         }|d   }|d   }t        t        |d         }t        t        |d         }t        t        |d         }	|j	                         dk\  rdnd}
|j
                  }t        |j                  |
      }t        t        |j                  |
g      |
      }t               f|j                  j                  z  }t        |||j                        }|t        |||j                        }t        |      }||c|d<   |d<   t        |	||j                        |d<   t        | t!        |      |      }t#        |j$                  |j$                  |j$                  ||j$                  nd |||	|j&                  |
|j                  |      }t)        |j                  |j                  |      }t        |||j*                  	      S )
Nr   r   ra   ro   r         rb   rc   )r   r	   re   r   r   rg   r#   r   r   r   r
   r$   r   r/   r   r7   r9   r   rh   r|   r   rd   )r0   r1   r2   rp   rU   rs   rt   rv   rw   r   ry   ri   rW   r   r   rj   r^   r   s                     r   _nll_loss_backward_handlerr     s   
 wQ(KWd1gA!WF!WFS$q'"IT!W%LQ(Luuw!|!K77D(+FH " ;-@+ !*~		>f&7CF!&*BDIIN :DvDGT!W|-EtyyQDG/tfM/!!	 & 2			F 		&H ** r   c                  ^    t         j                  j                  j                  t               y N)r	   r4   _custom_op_handlersupdatecustomized_loss_opsr   r   r   r   r     s    ..556IJr   c                  l    t         D ]+  } t        j                  j                  j	                  |        - y r   )r   r	   r4   r   pop)	custom_ops    r   r   r     s-    ( B	2266yABr   )A
contextlibtypingr   r   r   r   r*   torch._prims_common_prims_commonrD   )torch.distributed._functional_collectivesdistributed_functional_collectivesrK   "torch.distributed.distributed_c10ddistributed_c10drM   r   torch.distributed.device_meshr   torch.distributed.tensorr	   r
   r   &torch.distributed.tensor._dtensor_specr   r   ,torch.distributed.tensor._ops._embedding_opsr   'torch.distributed.tensor._ops._math_opsr   r   r   (torch.distributed.tensor.placement_typesr   opsaten__all__contextmanagerr   re   r#   r/   _ops
OpOverloadobjectstrr7   r_   rm   rr   Sizer   r   r   r   default_log_softmax_backward_datanll_loss_forwardnll_loss2d_forwardnll_loss_backwardnll_loss2d_backwardr   r   r   r   r   r   <module>r      s    . .  # : : 1 1  4 > > J E 
 ? yy~~ 
 - -d	%	3*? 	c 	c 	<in-<5?<< QZZ""Q

Q fQ 	Q&6ZZ""

 f 	>'ZZ""'

' f' 	'F F F  VF  6"	F 
 F  F  F  F  F  F  66>F RBZZ""B

B fB 	BXB5B5B5 B5 V	B5
 B5 B5 B5 B5 B5 B5 B5 B5J8ZZ""8

8 f8 	8x 	3##++-J!!#<##%>""$>$$&@ KBr   