
import logging
from collections import abc, defaultdict
from typing import Any, Dict, Iterable, List, Optional, overload, Sequence, Tuple, Union

import torch
import torch.distributed as dist
from torch.amp.grad_scaler import _MultiDeviceReplicator, GradScaler, OptState
from torch.distributed.distributed_c10d import ProcessGroup


logger = logging.getLogger(__name__)


def _refresh_per_optimizer_state() -> Dict[str, Any]:
    return {"stage": OptState.READY, "found_inf_per_device": {}}


def _is_supported_device(tensor: torch.Tensor) -> bool:
    return tensor.is_cuda or tensor.device.type in (
        "xla",
        "cpu",
        "hpu",
        "mtia",
        torch._C._get_privateuse1_backend_name(),
    )


class _GeneralMultiDeviceReplicator(_MultiDeviceReplicator):
    """
    Lazily serves tensor to request device. This class extends
    _MultiDeviceReplicator to allow support for "cpu" as a device.
    """

    def __init__(self, master_tensor: torch.Tensor) -> None:
        assert _is_supported_device(master_tensor)
        self.master = master_tensor
        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}


class ShardedGradScaler(GradScaler):
    """
    ShardedGradScaler helps perform gradient scaling in a shard aware manner. It extends
    functionality from GradScaler:
    * Supports PyTorch DDP and FSDP implementations
    * Supports CPU offloaded tensors (as used in fully sharded data parallel [FSDP])
    * Supports the custom Mixed Precision loss dtype (fp16, bf16) that FSDP returns (see the sketch below)
    * Syncs inf/nan for scaled gradient tensors on any torch.device (where tensors are placed) across
      nodes
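
    The fp16/bf16 loss dtype mentioned above typically comes from wrapping the
    model with FSDP mixed precision. A minimal sketch (the ``param_dtype`` choice
    is illustrative, not a recommendation)::

        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.fsdp import MixedPrecision

        model = FSDP(model, mixed_precision=MixedPrecision(param_dtype=torch.float16))
        scaler = ShardedGradScaler()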

    Example::

        # Creates a ShardedGradScaler once at the beginning of training.
        scaler = ShardedGradScaler()

        for epoch in epochs:
            for input, target in data:
                optimizer.zero_grad()
                output = model(input)
                loss = loss_fn(output, target)

                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
                scaler.scale(loss).backward()

                # scaler.step() first unscales gradients of the optimizer's params.
                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
                # otherwise, optimizer.step() is skipped.
                scaler.step(optimizer)

                # Updates the scale for next iteration.
                scaler.update()
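
    If gradients need to be modified or inspected between ``backward()`` and
    ``step()`` (for example for gradient clipping), the usual :class:`GradScaler`
    recipe of unscaling first applies here as well. A minimal sketch, reusing the
    names from the example above (``max_norm=1.0`` is only an illustrative value)::

        scaler.scale(loss).backward()
        # Unscale in place so the clip threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

    FSDP also exposes ``FullyShardedDataParallel.clip_grad_norm_`` as a
    sharding-aware alternative to ``torch.nn.utils.clip_grad_norm_``.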

    See :class:`GradScaler` for explanation of scaling/unscaling and more use cases.

    Args:
        init_scale (float, optional, default=2.**16):  Initial scale factor.
        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
            :meth:`update` if inf/NaN gradients occur in an iteration.
        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
            that must occur for the scale to be multiplied by ``growth_factor``.
        enabled (bool, optional):  If ``False``, disables gradient scaling. :meth:`step` simply
            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
            Default: ``True``
        process_group (ProcessGroup, optional, default=torch.distributed.group.WORLD):
            process group for sharding
    """

    def __init__(
        self,
        device: str = "cuda",
        init_scale: float = 2.0**16,
        backoff_factor: float = 0.5,
        growth_factor: float = 2.0,
        growth_interval: int = 2000,
        enabled: bool = True,
        process_group: Optional[ProcessGroup] = dist.group.WORLD,
    ) -> None:
        super().__init__(
            device,
            init_scale=init_scale,
            backoff_factor=backoff_factor,
            growth_factor=growth_factor,
            growth_interval=growth_interval,
            enabled=enabled,
        )
        if self._enabled:
            self.process_group = process_group
            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)

    @overload
    def scale(self, outputs: torch.Tensor) -> torch.Tensor:
        ...

    @overload
    def scale(self, outputs: List[torch.Tensor]) -> List[torch.Tensor]:
        ...

    @overload
    def scale(self, outputs: Tuple[torch.Tensor, ...]) -> Tuple[torch.Tensor, ...]:
        ...

    @overload
    def scale(self, outputs: Iterable[torch.Tensor]) -> Iterable[torch.Tensor]:
        ...

    def scale(
        self, outputs: Union[torch.Tensor, Iterable[torch.Tensor]]
    ) -> Union[torch.Tensor, Iterable[torch.Tensor]]:
        if not self._enabled:
            return outputs

        if isinstance(outputs, torch.Tensor):
            assert _is_supported_device(outputs)
            if self._scale is None:
                self._lazy_init_scale_growth_tracker(outputs.device)
            assert self._scale is not None
            scaled_output = outputs * self._scale.to(
                device=outputs.device, non_blocking=True
            )
            # Keep the scaled loss in the same dtype as the unscaled loss so the
            # FSDP mixed-precision (fp16/bf16) contract is preserved.
            return scaled_output.type(outputs.dtype)

        stash: List[_GeneralMultiDeviceReplicator] = []

        def apply_scale(val: Union[torch.Tensor, Iterable[torch.Tensor]]):
            if isinstance(val, torch.Tensor):
                assert _is_supported_device(val)
                if len(stash) == 0:
                    if self._scale is None:
                        self._lazy_init_scale_growth_tracker(val.device)
                    assert self._scale is not None
                    stash.append(_GeneralMultiDeviceReplicator(self._scale))
                scaled_val = val * stash[0].get(val.device)
                return scaled_val.type(val.dtype)
            if isinstance(val, abc.Iterable):
                iterator = map(apply_scale, val)
                if isinstance(val, (list, tuple)):
                    return type(val)(iterator)
                return iterator
            raise ValueError("outputs must be a Tensor or an iterable of Tensors")

        return apply_scale(outputs)

    def _foreach_non_finite_check_and_unscale_cpu_(
        self,
        grads: Sequence[torch.Tensor],
        found_inf: torch.Tensor,
        inv_scale: torch.Tensor,
    ) -> None:
        if len(grads) == 0:
            return
        assert inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."
        assert found_inf.numel() == 1, "found_inf must be a 1-element tensor."

        for grad in grads:
            if grad.device.type != "cpu":
                logger.error(
                    "tensor device is %s but was expected to be ``cpu``",
                    grad.device,
                )
                raise ValueError(
                    "Gradients were found on a non-CPU device when"
                    " expected to be on CPU."
                )
            if (
                torch.isinf(grad).any().item() is True
                or torch.isnan(grad).any().item() is True
            ):
                found_inf.data = torch.tensor([1.0])
                break
            else:
                grad.data *= inv_scale.item()

    def _unscale_grads_(
        self,
        optimizer: torch.optim.Optimizer,
        inv_scale: torch.Tensor,
        found_inf: torch.Tensor,
        allow_fp16: bool = True,
    ) -> Dict[torch.device, torch.Tensor]:
        per_device_inv_scale = _GeneralMultiDeviceReplicator(inv_scale)
        per_device_found_inf = _GeneralMultiDeviceReplicator(found_inf)

        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by
        # device and dtype. There could be hundreds of grads, so iterate through
        # them just once even though their devices and dtypes are not known in
        # advance.
        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))
        with torch.no_grad():
            for group in optimizer.param_groups:
                for param in group["params"]:
                    if param.grad is None:
                        continue
                    if (not allow_fp16) and param.grad.dtype == torch.float16:
                        raise ValueError("Attempting to unscale FP16 gradients.")
                    if param.grad.is_sparse:
                        # For scaled fp16 sparse grads, coalescing may overflow,
                        # so coalesce in fp32 and unscale the coalesced _values().
                        if param.grad.dtype is torch.float16:
                            param_grad_fp32 = param.grad.type(torch.float32).coalesce()
                            param.grad = param_grad_fp32.type(torch.float16)
                        to_unscale = param.grad._values()
                    else:
                        to_unscale = param.grad

                    per_device_and_dtype_grads[to_unscale.device][
                        to_unscale.dtype
                    ].append(to_unscale)

            for device, per_dtype_grads in per_device_and_dtype_grads.items():
                for grads in per_dtype_grads.values():
                    if grads[0].device.type == "cpu":
                        self._foreach_non_finite_check_and_unscale_cpu_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
                    else:
                        torch._amp_foreach_non_finite_check_and_unscale_(
                            grads,
                            per_device_found_inf.get(device),
                            per_device_inv_scale.get(device),
                        )
        # Some ranks may hold no (non-zero sized) parameter shards, so make sure a
        # found_inf tensor exists for the scale's device before returning.
        if not per_device_found_inf._per_device_tensors:
            assert self._scale is not None
            per_device_found_inf.get(self._scale.device)
        return per_device_found_inf._per_device_tensors

    def unscale_(self, optimizer: torch.optim.Optimizer) -> None:
        if not self._enabled:
            return

        self._check_scale_growth_tracker("unscale_")

        optimizer_state = self._per_optimizer_states[id(optimizer)]

        if optimizer_state["stage"] is OptState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["stage"] is OptState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        # FP32 division can be imprecise for certain compile options, so we carry
        # out the reciprocal in FP64.
        assert self._scale is not None
        inv_scale = self._scale.double().reciprocal().float()
        found_inf = torch.full(
            (1,), 0.0, dtype=torch.float32, device=self._scale.device
        )

        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
            optimizer, inv_scale, found_inf, True
        )
        optimizer_state["stage"] = OptState.UNSCALED

        # Synchronize the detected inf across the ranks.
        optimizer_state = self._per_optimizer_states[id(optimizer)]
        works = []
        found_inf_on_cpus = []
        found_inf_on_devices = []

        for found_inf in optimizer_state["found_inf_per_device"].values():
            if self._device != "cpu" and found_inf.device.type == "cpu":
                found_inf_on_cpus.append(found_inf)
                found_inf_on_device = found_inf.to(self._device)
                found_inf_on_devices.append(found_inf_on_device)
                works.append(
                    dist.all_reduce(
                        found_inf_on_device, async_op=True, group=self.process_group
                    )
                )
            else:
                works.append(
                    dist.all_reduce(found_inf, async_op=True, group=self.process_group)
                )
        for work in works:
            work.wait()
        if found_inf_on_cpus:
            torch._foreach_copy_(found_inf_on_cpus, found_inf_on_devices)

    def _amp_update_scale_cpu_(self, found_inf: torch.Tensor) -> None:
        """
        If found_inf is 1.0 (True), then scale is multiplied by backoff_factor and growth_tracker is set to zero.
        Otherwise, scale is multiplied by the growth factor when the growth interval is reached.
        """
        assert self._scale is not None and self._growth_tracker is not None

        if found_inf.item() >= 1.0:
            self._scale *= self._backoff_factor
            self._growth_tracker.fill_(0)
        else:
            successful = self._growth_tracker + 1
            if successful == self._growth_interval:
                self._scale *= self._growth_factor
                self._growth_tracker.fill_(0)
            else:
                self._growth_tracker = successful

    def update(self, new_scale: Optional[Union[float, torch.Tensor]] = None) -> None:
        """
        Updates the scale factor.
        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
        the scale is multiplied by ``growth_factor`` to increase it.
        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
        used directly, it's used to fill GradScaler's internal scale tensor. So if
        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
        affect the scale GradScaler uses internally.)
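        As a concrete illustration (using the constructor defaults, not a
        prescription): with ``backoff_factor=0.5``, ``growth_factor=2.0`` and
        ``growth_interval=2000``, a scale of ``2.**16 = 65536`` is halved to
        ``32768`` by an iteration whose gradients contain inf/NaN, and is doubled
        again only after 2000 consecutive iterations free of inf/NaN.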
        Args:
            new_scale (float or :class:`torch.Tensor`, optional, default=None):  New scale factor.
        .. warning::
            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
            been invoked for all optimizers used this iteration.
        """
        if not self._enabled:
            return

        _scale, _growth_tracker = self._check_scale_growth_tracker("update")

        if new_scale is not None:
            # Accept a new user-defined scale.
            if isinstance(new_scale, float):
                self._scale.fill_(new_scale)
            else:
                reason = (
                    "new_scale should be a float or a 1-element torch.cuda.FloatTensor or "
                    "torch.FloatTensor with requires_grad=False."
                )
                assert new_scale.device.type == self._device, reason
                assert new_scale.numel() == 1, reason
                assert new_scale.requires_grad is False, reason
                self._scale.copy_(new_scale)
        else:
            # Consume shared inf/nan data collected from optimizers to update the
            # scale. If all found_inf tensors are on the same device as
            # self._scale, this operation is asynchronous.
            found_infs = [
                found_inf.to(device=_scale.device, non_blocking=True)
                for state in self._per_optimizer_states.values()
                for found_inf in state["found_inf_per_device"].values()
            ]

            assert len(found_infs) > 0, "No inf checks were recorded prior to update."

            found_inf_combined = found_infs[0]
            if len(found_infs) > 1:
                for i in range(1, len(found_infs)):
                    found_inf_combined += found_infs[i]

            if _scale.device.type == "cpu":
                self._amp_update_scale_cpu_(found_inf_combined)
            else:
                torch._amp_update_scale_(
                    self._scale,
                    self._growth_tracker,
                    found_inf_combined,
                    self._growth_factor,
                    self._backoff_factor,
                    self._growth_interval,
                )

        # To prepare for the next iteration, clear the data collected from optimizers.
        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)