
"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, List, Optional, Union

import tensorflow as tf


try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# The schedules submodule moved between Keras versions; support both layouts.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: str = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Polynomial warmup: while step < warmup_steps, the learning rate is
            # `initial_learning_rate * (step / warmup_steps) ** power`; afterwards the
            # wrapped decay schedule takes over, offset by the warmup length.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }


def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[List[str]] = None,
):
    """
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
    """
    # Linear (or polynomial) decay of the learning rate after the warmup phase.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # Return the schedule alongside the optimizer so the learning rate can be tracked
    # independently of the optimizer state.
    return optimizer, lr_schedule
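
# Usage sketch (illustrative, not executed on import): the learning rate, step counts and
# weight decay below are arbitrary placeholder values, and `model` is assumed to be an
# already-built Keras model. `create_optimizer` composes a `PolynomialDecay` schedule,
# wraps it in `WarmUp` when `num_warmup_steps` is non-zero, and pairs it with
# `AdamWeightDecay` (or plain Adam when `weight_decay_rate` is 0).
#
#     optimizer, lr_schedule = create_optimizer(
#         init_lr=5e-5,
#         num_train_steps=10_000,
#         num_warmup_steps=1_000,
#         weight_decay_rate=0.01,
#     )
#     model.compile(optimizer=optimizer)      # plus your loss/metrics
#     print(float(lr_schedule(500)))          # learning rate while still in the warmup phase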


class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value; `decay` is included for backward compatibility to allow time
            inverse decay of the learning rate. `lr` is included for backward compatibility; it is recommended to use
            `learning_rate` instead.
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[List[str]] = None,
        exclude_from_weight_decay: Optional[List[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super(AdamWeightDecay, cls).from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super(AdamWeightDecay, self)._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super(AdamWeightDecay, self).apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super(AdamWeightDecay, self)._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
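
# Usage sketch (illustrative, not executed on import): constructing the optimizer directly
# rather than through `create_optimizer`. The hyperparameter values are placeholders; the
# exclude list follows the usual convention of not decaying bias and layer-norm parameters.
#
#     optimizer = AdamWeightDecay(
#         learning_rate=3e-5,
#         weight_decay_rate=0.01,
#         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
#     )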


class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    """

    # The ON_READ synchronization policy means no synchronization is performed on
    # assignment; `.value()` returns the value on the current replica without
    # synchronization.

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
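
# Usage sketch (illustrative, not executed on import): accumulating gradients over several
# micro-batches before a single optimizer update. `model`, `loss_fn`, `optimizer` and
# `micro_batches` are assumed to exist; scale the summed gradients first if a mean is wanted.
#
#     accumulator = GradientAccumulator()
#     for features, labels in micro_batches:
#         with tf.GradientTape() as tape:
#             loss = loss_fn(labels, model(features, training=True))
#         accumulator(tape.gradient(loss, model.trainable_variables))
#     optimizer.apply_gradients(zip(accumulator.gradients, model.trainable_variables))
#     accumulator.reset()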