
"""PyTorch CANINE model."""

import copy
import math
import os
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    ModelOutput,
    MultipleChoiceModelOutput,
    QuestionAnsweringModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_canine import CanineConfig


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "google/canine-s"
_CONFIG_FOR_DOC = "CanineConfig"

# Primes used by the multi-hash character embedding (up to 16 hash functions are supported).
_PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211, 223]


@dataclass
class CanineModelOutputWithPooling(ModelOutput):
    """
    Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly
    different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow
    Transformer encoders.
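
    For instance, with the default `config.downsampling_rate` of 4, a 2048-character input yields shallow-encoder
    hidden states of length 2048 and deep-encoder hidden states of length 512.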

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final
            shallow Transformer encoder).
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Hidden-state of the first token of the sequence (classification token) at the last layer of the deep
            Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer
            weights are trained from the next sentence prediction (classification) objective during pretraining.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each
            encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length //
            config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the
            initial input to each Transformer encoder. The hidden states of the shallow encoders have length
            `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` //
            `config.downsampling_rate`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size,
            num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length //
            config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    """

    last_hidden_state: torch.FloatTensor = None
    pooler_output: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


def load_tf_weights_in_canine(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF checkpoint
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are optimizer slots that are not needed at inference time; the cls / decoder
        # variables belong to the pretraining heads and are skipped as well.
        if any(
            n
            in [
                "adam_v",
                "adam_m",
                "AdamWeightDecayOptimizer",
                "AdamWeightDecayOptimizer_1",
                "global_step",
                "cls",
                "autoregressive_decoder",
                "char_output_weights",
            ]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            continue
        # if first scope name starts with "bert", change it to "encoder"
        if name[0] == "bert":
            name[0] = "encoder"
        # remove "embeddings" middle name of HashBucketCodepointEmbedders
        elif name[1] == "embeddings":
            name.remove(name[1])
        # rename segment_embeddings to token_type_embeddings
        elif name[1] == "segment_embeddings":
            name[1] = "token_type_embeddings"
        # rename initial convolutional projection layer
        elif name[1] == "initial_char_encoder":
            name = ["chars_to_molecules"] + name[-2:]
        # rename final convolutional projection layer
        elif name[0] == "final_char_encoder" and name[1] in ["LayerNorm", "conv"]:
            name = ["projection"] + name[1:]
        pointer = model
        for m_name in name:
            if (re.fullmatch(r"[A-Za-z]+_\d+", m_name)) and "Embedder" not in m_name:
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "output_weights":
                pointer = getattr(pointer, "weight")
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name[-10:] in [f"Embedder_{i}" for i in range(8)]:
            pointer = getattr(pointer, "weight")
        elif m_name == "kernel":
            array = np.transpose(array)
        if pointer.shape != array.shape:
            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array)
    return model


class CanineEmbeddings(nn.Module):
    """Construct the character, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()

        self.config = config

        # character embeddings
        shard_embedding_size = config.hidden_size // config.num_hash_functions
        for i in range(config.num_hash_functions):
            name = f"HashBucketCodepointEmbedder_{i}"
            setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size))
        self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int):
        """
        Converts ids to hash bucket ids via multiple hashing.

        Args:
            input_ids: The codepoints or other IDs to be hashed.
            num_hashes: The number of hash functions to use.
            num_buckets: The number of hash buckets (i.e. embeddings in each table).

        Returns:
            A list of tensors, each of which is the hash bucket IDs from one hash function.
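
        Example (illustrative): with `num_buckets=16384` (the `config.num_hash_buckets` default) and the first
        `_PRIMES` entry `31`, codepoint `97` ("a") maps to `((97 + 1) * 31) % 16384 = 3038`.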
        z`num_hashes` must be <= Nr   )rz   _PRIMESr   rq   )r   	input_idsr   r   primesresult_tensorsprimehasheds           r=   _hash_bucket_tensorsz%CanineEmbeddings._hash_bucket_tensors   sp     G$7G~FGG*% 	*E 1}-<F!!&)	* r<   embedding_sizec                    ||z  dk7  rt        d| d| d      | j                  |||      }g }t        |      D ]-  \  }}d| }	 t        | |	      |      }
|j	                  |
       / t        j                  |d      S )	zDConverts IDs (e.g. codepoints) into embeddings via multiple hashing.r   zExpected `embedding_size` (z) % `num_hashes` (z) == 0)r   r   r   r   dim)r   r   	enumeraterx   rq   r8   cat)r   r   r   r   r   hash_bucket_tensorsembedding_shardsr   hash_bucket_idsr   shard_embeddingss              r=   _embed_hash_bucketsz$CanineEmbeddings._embed_hash_buckets   s    J&!+:>:JJ\]g\hhnopp"77	jfq7r"+,?"@ 	6A1!5D2wtT2?C##$45	6
 yy)r22r<   r   token_type_idsr   inputs_embedsreturnc                 `   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|P| j                  || j                  j                  | j                  j                  | j                  j                        }| j                  |      }||z   }| j                  dk(  r| j                  |      }	||	z  }| j                  |      }| j                  |      }|S )Nr   r   dtypedevicer   )sizer   r8   zeroslongr   r   r   r   r   r   rP   r   r   rU   r   )
r   r   r   r   r   input_shape
seq_lengthrP   rN   position_embeddingss
             r=   forwardzCanineEmbeddings.forward   s$     #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  444;;22DKK4R4RTXT_T_TpTpM !% : :> J"%::
'':5"&"?"?"M--J^^J/
\\*-
r<   )NNNN)r4   r5   r6   r7   r   r{   r   r   r   r8   
LongTensorr9   r   __classcell__r   s   @r=   r   r      s    F^0# C .3S 3c 3`c 3  15593759"E,,-" !!1!12" u//0	"
   1 12" 
		"r<   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )CharactersToMoleculeszeConvert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.c                 >   t         |           t        j                  |j                  |j                  |j
                  |j
                        | _        t        |j                     | _	        t        j                  |j                  |j                        | _
        y )Nin_channelsout_channelskernel_sizestrider   )r   r   r   Conv1dr   downsampling_raterV   r   
hidden_act
activationrU   r   r   r   r   s     r=   r   zCharactersToMolecules.__init__'  sv    II**++00++	
	 !!2!23 f&8&8f>S>STr<   char_encodingr   c                 2   |d d ddd d f   }t        j                  |dd      }| j                  |      }t        j                  |dd      }| j                  |      }|d d ddd d f   }t        j                  ||gd      }| j                  |      }|S )Nr   r   r`   r   r   )r8   r}   rV   r   r   rU   )r   r   cls_encodingdownsampleddownsampled_truncatedresults         r=   r   zCharactersToMolecules.forward6  s    $Q!QY/ q!<ii.ook1a8ook2 !,AqtQJ 7 L*?@aH'r<   )	r4   r5   r6   r7   r   r8   Tensorr   r   r   s   @r=   r   r   $  s'    oUU\\ ell r<   r   c                   |     e Zd ZdZ fdZ	 ddej                  deej                     dej                  fdZ xZ	S )ConvProjectionz


class ConvProjection(nn.Module):
    """
    Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size
    characters.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.conv = nn.Conv1d(
            in_channels=config.hidden_size * 2,
            out_channels=config.hidden_size,
            kernel_size=config.upsampling_kernel_size,
            stride=1,
        )
        self.activation = ACT2FN[config.hidden_act]
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        inputs: torch.Tensor,
        final_seq_char_positions: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # inputs has shape [batch, mol_seq, molecule_hidden_size+char_hidden_final];
        # we transpose it to be [batch, molecule_hidden_size+char_hidden_final, mol_seq]
        inputs = torch.transpose(inputs, 1, 2)

        # Pad manually so that the convolution behaves like padding="same" in the original TF implementation
        pad_total = self.config.upsampling_kernel_size - 1
        pad_beg = pad_total // 2
        pad_end = pad_total - pad_beg

        pad = nn.ConstantPad1d((pad_beg, pad_end), 0)
        # `result`: shape (batch_size, char_seq_len, hidden_size)
        result = self.conv(pad(inputs))
        result = torch.transpose(result, 1, 2)
        result = self.activation(result)
        result = self.LayerNorm(result)
        result = self.dropout(result)
        final_char_seq = result

        if final_seq_char_positions is not None:
            # Limiting the transformer query sequence and attention mask to these character positions
            # would greatly reduce the compute cost; this is only needed for the MLM task.
            raise NotImplementedError("CanineForMaskedLM is currently not supported")
        else:
            query_seq = final_char_seq

        return query_seq
dej                  dej                  deej                     deej                     dee	   de
ej                  eej                     f   fd	Z xZS )CanineSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rG|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        y y )Nr   r   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_queryr`   r   )r   r   r   num_attention_headshasattrr   r{   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   rx   r   r   r   distance_embeddingr   s     r=   r   zCanineSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr<   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr   r   r`   r   r   )r   r  r  viewpermute)r   xnew_x_shapes      r=   transpose_for_scoresz(CanineSelfAttention.transpose_for_scores  sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r<   from_tensor	to_tensorattention_mask	head_maskoutput_attentionsr   c                 P   | j                  |      }| j                  | j                  |            }| j                  | j                  |            }| j                  |      }	t	        j
                  |	|j                  dd            }
| j                  dk(  s| j                  dk(  rF|j                         d   }t	        j                  |t        j                  |j                        j                  dd      }t	        j                  |t        j                  |j                        j                  dd      }||z
  }| j                  || j                  z   dz
        }|j                  |	j                         }| j                  dk(  rt	        j"                  d|	|      }|
|z   }
nE| j                  dk(  r6t	        j"                  d|	|      }t	        j"                  d	||      }|
|z   |z   }
|
t%        j&                  | j(                        z  }
|h|j*                  d
k(  rTt	        j,                  |d      }d|j/                         z
  t	        j0                  |
j                         j2                  z  }|
|z   }
t4        j6                  j9                  |
d      }| j;                  |      }|||z  }t	        j
                  ||      }|j=                  dddd
      j?                         }|j                         d d | j@                  fz   } |j                  | }|r||f}|S |f}|S )Nr   rS   r   r  r   r   )r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   r         ?r   r`   )!r  r  r  r	  r8   matmulr}   r   r   r   r   r   r  r  r   tor   einsummathsqrtr  ndim	unsqueezefloatfinfominr   
functionalsoftmaxr   r  
contiguousr  )r   r  r  r  r  r  mixed_query_layer	key_layervalue_layerquery_layerattention_scoresr   position_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                          r=   r   zCanineSelfAttention.forward  s    !JJ{3 --dhhy.AB	//

90EF//0AB !<<Y5H5HR5PQ''>9T=Y=Y]q=q$))+A.J"\\*EJJ{OaOabgghjlmnN"\\*EJJ{OaOabgghikmnN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%""a'!&Q!G #&(<(<(>">%++N^NdNdBeBiBi!i/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r<   NNF)r4   r5   r6   r   r  r8   r   r   r9   boolr   r   r   r   s   @r=   r   r     s    u,% 7;15,1E\\E <<E !!2!23	E
 E--.E $D>E 
u||Xell33	4Er<   r   c                        e Zd Z fdZdeej                     dej                  deej                  ej                  f   fdZ xZS )CanineSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denserU   r   r   r   r   r   s     r=   r   zCanineSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r<   r2   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r<  r   rU   r   r2   r=  s      r=   r   zCanineSelfOutput.forward  s9     

=1]3}|'CDr<   	r4   r5   r6   r   r   r8   r9   r   r   r   s   @r=   r9  r9    sL    >"5#4#45EJEVEV	u  %"3"33	4r<   r9  c                       e Zd ZdZ	 	 	 	 	 	 	 ddedededededef fdZd	 Z	 	 	 dd
ee	j                     dee	j                     dee	j                     dee   dee	j                  ee	j                     f   f
dZ xZS )CanineAttentionav  
    Additional arguments related to local attention:

        - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention.
        - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able
          to attend to the `to_tensor`'s first position (e.g. a [CLS] position)?
        - **first_position_attends_to_all** (`bool`, *optional*, defaults to `False`) -- Should the *from_tensor*'s
          first position be able to attend to all positions within the *from_tensor*?
        - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
          `from_tensor`.
        - **attend_from_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when
          moving to the next block in `from_tensor`.
        - **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in
          `to_tensor`.
        - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to skip when
          moving to the next block in `to_tensor`.
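
        For example, with `attend_from_chunk_width = attend_from_chunk_stride = 128` and a 512-character `from_tensor`,
        the queries are split into the block-wise chunks `(0, 128)`, `(128, 256)`, `(256, 384)` and `(384, 512)`, and
        the i-th chunk attends only to the i-th `to_tensor` chunk (plus position 0 when
        `always_attend_to_first_position` is set).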
    always_attend_to_first_positionfirst_position_attends_to_allattend_from_chunk_widthattend_from_chunk_strideattend_to_chunk_widthattend_to_chunk_stridec	                 "   t         	|           t        |      | _        t	        |      | _        t               | _        || _        ||k  rt        d      ||k  rt        d      || _
        || _        || _        || _        || _        || _        y )Nze`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped.z``attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped.)r   r   r   r   r9  outputsetpruned_headslocalr   rD  rE  rF  rG  rH  rI  
r   r   rN  rD  rE  rF  rG  rH  rI  r   s
            r=   r   zCanineAttention.__init__  s     	'/	&v.E 
"%==w  !#99r  0O,-J*'>$(@%%:"&<#r<   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )rz   r   r   r  r  rM  r   r  r  r	  rK  r<  r  union)r   headsindexs      r=   prune_headszCanineAttention.prune_heads2  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r<   r2   r  r  r  r   c                    | j                   s| j                  |||||      }|d   }n|j                  d   x}}|x}	}
g }| j                  r|j	                  d       d}nd}t        ||| j                        D ].  }t        ||| j                  z         }|j	                  ||f       0 g }| j                  r|j	                  d|f       t        d|| j                        D ].  }t        ||| j                  z         }|j	                  ||f       0 t        |      t        |      k7  rt        d| d| d      g }g }t        ||      D ]  \  \  }}\  }}|	d d ||d d f   }|
d d ||d d f   }|d d ||||f   }| j                  rN|d d ||ddf   }t        j                   ||gd      }|
d d ddd d f   }t        j                   ||gd      }| j                  |||||      }|j	                  |d          |s|j	                  |d           t        j                   |d      }| j#                  ||      }|f}| j                   s
|dd  z   }|S |t%              z   }|S )	Nr   r   )r   r   z/Expected to have same number of `from_chunks` (z) and `to_chunks` (z). Check strides.r`   r   )rN  r   r~   rE  rq   r|   rG  r"  rF  rI  rH  rz   r   rr   rD  r8   r   rK  tuple)r   r2   r  r  r  self_outputsattention_outputfrom_seq_lengthto_seq_lengthr  r  from_chunks
from_startchunk_start	chunk_end	to_chunksattention_output_chunksattention_probs_chunksfrom_endto_startto_endfrom_tensor_chunkto_tensor_chunkattention_mask_chunkcls_attention_maskcls_positionattention_outputs_chunkr5  s                               r=   r   zCanineAttention.forwardD  s    zz99]M>S\^opL+A.;.A.A!.DDOm&33K) K11""6* 

$Z$B_B_` =t?[?[1[\	""K#;<=
 I11  !]!34$Qt7R7RS ;{T=W=W/WX	  +y!9:; ;3y>1 Ek] S$$/=0AC  ')#%'">A+y>Y N:&X(:6$/:h3F0I$J!"+Ax,A"B (6aH9LhW]o6]'^$77)7:h;NPQRSPS8S)T&+0996HJ^5_ef+g(#,Q!QY#7L&+ii0OUV&WO*.))%8LiYj+' (../Fq/IJ$*112I!2LM%N(  %yy)@aH;;'7G#%zzQR 00G  &< ==Gr<   FFF   rl  rl  rl  r6  )r4   r5   r6   r7   r7  r{   r   rT  r   r8   r9   r   r   r   r   s   @r=   rC  rC     s    & 05.3'*(+%(&)= *.	=
 (,= "%= #&=  #= !$=B;* 7;15,1HU../H !!2!23H E--.	H
 $D>H 
u  (5+<+<"==	>Hr<   rC  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CanineIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   r  r   intermediate_sizer<  
isinstancer   strr   intermediate_act_fnr   s     r=   r   zCanineIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r<   r2   r   c                 J    | j                  |      }| j                  |      }|S r   )r<  rs  r   r2   s     r=   r   zCanineIntermediate.forward  s&    

=100?r<   )r4   r5   r6   r   r8   r9   r   r   r   s   @r=   rn  rn    s'    9U%6%6 5;L;L r<   rn  c                   t     e Zd Z fdZdeej                     dej                  dej                  fdZ xZS )CanineOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r;  )r   r   r   r  rp  r   r<  rU   r   r   r   r   r   s     r=   r   zCanineOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r<   r2   r=  r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r?  r@  s      r=   r   zCanineOutput.forward  s7    

=1]3}|'CDr<   rA  r   s   @r=   rw  rw    s:    >U5+<+<%= UM^M^ chctct r<   rw  c                        e Zd Z fdZ	 	 	 d	deej                     deej                     deej                     dee   deej                  eej                     f   f
dZ	d Z
 xZS )
CanineLayerc	           
          t         	|           |j                  | _        d| _        t	        ||||||||      | _        t        |      | _        t        |      | _	        y Nr   )
r   r   chunk_size_feed_forwardseq_len_dimrC  	attentionrn  intermediaterw  rK  rO  s
            r=   r   zCanineLayer.__init__  se     	'-'E'E$(+)#$!"	
 /v6"6*r<   r2   r  r  r  r   c                     | j                  ||||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r  r   r   )r  r   feed_forward_chunkr~  r  )	r   r2   r  r  r  self_attention_outputsrX  r5  layer_outputs	            r=   r   zCanineLayer.forward  sv     "&/	 "0 "
 2!4(,0##T%A%A4CSCSUe
  /G+r<   c                 L    | j                  |      }| j                  ||      }|S r   )r  rK  )r   rX  intermediate_outputr  s       r=   r  zCanineLayer.feed_forward_chunk  s,    "//0@A{{#68HIr<   r6  )r4   r5   r6   r   r   r8   r9   r   r7  r   r  r   r   s   @r=   r{  r{    s    +< 7;15,1U../ !!2!23 E--.	
 $D> 
u  (5+<+<"==	>0r<   r{  c                        e Zd Z	 	 	 	 	 	 	 d
 fd	Z	 	 	 	 	 ddeej                     deej                     deej                     dee   dee   dee   de	ee
f   fd	Z xZS )CanineEncoderc	                     t         
|           || _        t        j                  t        |j                        D 	cg c]  }	t        ||||||||       c}	      | _        d| _	        y c c}	w )NF)
r   r   r   r   
ModuleListr|   num_hidden_layersr{  layergradient_checkpointing)r   r   rN  rD  rE  rF  rG  rH  rI  _r   s             r=   r   zCanineEncoder.__init__  sx     	]] v778  31+,)*	

 ',#s   A*r2   r  r  r  output_hidden_statesreturn_dictr   c                 x   |rdnd }|rdnd }t        | j                        D ]j  \  }	}
|r||fz   }|||	   nd }| j                  r,| j                  r | j	                  |
j
                  ||||      }n |
||||      }|d   }|sb||d   fz   }l |r||fz   }|st        d |||fD              S t        |||      S )Nr;   r   r   c              3   &   K   | ]	  }||  y wr   r;   rI   vs     r=   rK   z(CanineEncoder.forward.<locals>.<genexpr>*  s     mq_`_lm   )r0   r2   r3   )r   r  r  training_gradient_checkpointing_func__call__rV  r   )r   r2   r  r  r  r  r  all_hidden_statesall_self_attentionsr   layer_modulelayer_head_masklayer_outputss                r=   r   zCanineEncoder.forward  s    #7BD$5b4(4 	POA|#$58H$H!.7.CilO**t}} $ A A ))!"#%! !-]NO]n o)!,M &9]1=M<O&O#'	P*   1]4D Dm]4EGZ$[mmm++*
 	
r<   rk  )NNFFT)r4   r5   r6   r   r   r8   r9   r   r7  r   r   r   r   r   s   @r=   r  r    s     (-&+ #!$!",B 7;15,1/4&**
U../*
 !!2!23*
 E--.	*

 $D>*
 'tn*
 d^*
 
uo%	&*
r<   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r   r   r   r  r   r<  Tanhr   r   s     r=   r   zCaninePooler.__init__3  s9    YYv1163E3EF
'')r<   r2   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r<  r   )r   r2   first_token_tensorpooled_outputs       r=   r   zCaninePooler.forward8  s6     +1a40

#566r<   rA  r   s   @r=   r  r  2  s,    $
U5+<+<%= %BSBS r<   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CaninePredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r;  )r   r   r   r  r   r<  rq  r   rr  r   transform_act_fnrU   r   r   s     r=   r   z&CaninePredictionHeadTransform.__init__B  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr<   r2   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r<  r  rU   ru  s     r=   r   z%CaninePredictionHeadTransform.forwardK  s4    

=1--m<}5r<   rA  r   s   @r=   r  r  A  s-    UU5+<+<%= %BSBS r<   r  c                   \     e Zd Z fdZdeej                     dej                  fdZ xZS )CanineLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)r^   )r   r   r  	transformr   r  r   
vocab_sizedecoder	Parameterr8   r   r^   r   s     r=   r   zCanineLMPredictionHead.__init__S  sm    6v> yy!3!3V5F5FUSLLV->->!?@	 !IIr<   r2   r   c                 J    | j                  |      }| j                  |      }|S r   )r  r  ru  s     r=   r   zCanineLMPredictionHead.forward`  s$    }5]3r<   rA  r   s   @r=   r  r  R  s,    &U5+<+<%= %BSBS r<   r  c                   b     e Zd Z fdZdeej                     deej                     fdZ xZS )CanineOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r  predictionsr   s     r=   r   zCanineOnlyMLMHead.__init__g  s    1&9r<   sequence_outputr   c                 (    | j                  |      }|S r   )r  )r   r  prediction_scoress      r=   r   zCanineOnlyMLMHead.forwardk  s     !,,_=  r<   )	r4   r5   r6   r   r   r8   r   r   r   r   s   @r=   r  r  f  s1    :!u||,! 
u||	!r<   r  c                   &    e Zd ZdZeZeZdZdZ	d Z
y)CaninePreTrainedModelz
class CaninePreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CanineConfig
    load_tf_weights = load_tf_weights_in_canine
    base_model_prefix = "canine"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            # Slightly different from the TF version, which uses truncated_normal for initialization
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


CANINE_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CanineConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a5
  
    Args:
        input_ids (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z`The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Zd fd	Zd Zd Zdej                  defdZ	dej                  dej                  d	ej                  fd
Z
 eej                  d             eeee      	 	 	 	 	 	 	 	 	 ddeej&                     deej(                     deej&                     deej&                     deej(                     deej(                     dee   dee   dee   d	eeef   fd              Z xZS )CanineModelc           
         t         |   |       || _        t        j                  |      }d|_        t        |      | _        t        |ddd|j                  |j                  |j                  |j                        | _
        t        |      | _        t        |      | _        t        |      | _        t        |      | _        |rt#        |      nd | _        | j'                          y )Nr   TF)rN  rD  rE  rF  rG  rH  rI  )r   r   r   copydeepcopyr  r   char_embeddingsr  local_transformer_striderQ   r   rR   rM   r   rW   rT   r  pooler	post_init)r   r   add_pooling_layershallow_configr   s       r=   r   zCanineModel.__init__  s     v.+,(/7$1,1*/$*$C$C%+%D%D"("A"A#)#B#B	%
! #8"?$V,(0"/"?.?l6*T 	r<   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrM   r  r  rT  )r   heads_to_pruner  rR  s       r=   _prune_headszCanineModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr<   c                    |j                   d   |j                   d   }}|j                   d   }t        j                  ||d|f      j                         }t        j                  ||dft        j
                  |j                        }||z  }|S )aP  
        Create 3D attention mask from a 2D tensor mask.

        Args:
            from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
            to_mask: int32 Tensor of shape [batch_size, to_seq_length].

        Returns:
            float Tensor of shape [batch_size, from_seq_length, to_seq_length].
        r   r   )r   r   r   )r~   r8   reshaper   onesfloat32r   )r   r  to_mask
batch_sizerY  rZ  broadcast_onesmasks           r=   )_create_3d_attention_mask_from_input_maskz5CanineModel._create_3d_attention_mask_from_input_mask  s     '2&7&7&:K<M<Ma<PO
a(--*a)GHNNP
 *oq)IQVQ^Q^gnguguv 'r<   char_attention_maskr   c                     |j                   \  }}t        j                  ||d|f      }t        j                  j	                  ||      |j                               }t        j                  |d      }|S )z[Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.r   )r   r   r   r   )r~   r8   r  r   	MaxPool1dr   squeeze)r   r  r   r  char_seq_lenpoolable_char_maskpooled_molecule_maskmolecule_attention_masks           r=   _downsample_attention_maskz&CanineModel._downsample_attention_mask  sw     $7#<#< 
L"]]+>QP\@]^  %xx11>OXi1j$$& 

 #(--0D""M&&r<   	moleculeschar_seq_lengthr   c                    | j                   j                  }|ddddddf   }t        j                  ||d      }|ddddddf   }t        j                  t        j
                  |      t        j
                  |            j                         }t        j                  |||z   d      }t        j                  ||gd      S )zDRepeats molecules to make them the same length as the char sequence.Nr   rS   )repeatsr   r   r   )r   r   r8   repeat_interleavefmodtensoritemr   )	r   r  r  ratemolecules_without_extra_clsrepeatedlast_moleculeremainder_lengthremainder_repeateds	            r=   _repeat_moleculeszCanineModel._repeat_molecules#  s     {{,,&/12q&9#**+FPTZ\] "!RS!), ::ell?&CU\\RVEWX]]_"44$t+	
 yy($67R@@r<   batch_size, sequence_length
checkpointoutput_typer  r   r  r   r   r  r   r  r  r  c
                 B   ||n| j                   j                  }||n| j                   j                  }|rdnd }
|rdnd }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }n!||j                         d d }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }|&t        j                  |t        j                  |      }| j                  ||      }| j                  || j                   j                        }| j                  |||j                  d   f      }| j!                  || j                   j"                        }| j%                  ||||      }| j'                  ||n||      }| j)                  ||||	      }|j*                  }| j-                  |      }| j/                  ||||||	
      }|d   }| j0                  | j1                  |      nd }| j3                  ||d         }t        j4                  ||gd      }| j7                  |      }| j9                  ||||	      }|j*                  }|r2|	r|j:                  n|d   }|
|j:                  z   |z   |j:                  z   }
|r2|	r|j<                  n|d   } ||j<                  z   | z   |j<                  z   }|	s||f}!|!t?        d |
|fD              z  }!|!S tA        |||
|      S )Nr;   zDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embeds)r   r   )r   )r   r   r   r   )r  r  r  )r  r  r  r  r  r   )r  r   r   c              3   &   K   | ]	  }||  y wr   r;   r  s     r=   rK   z&CanineModel.forward.<locals>.<genexpr>  s     a!STS`Aar  )r0   r1   r2   r3   )!r   r  r  use_return_dictr   %warn_if_padding_and_no_attention_maskr   r   r8   r  r   r   get_extended_attention_maskr  r   r~   get_head_maskr  r  r  rQ   r0   rR   rM   r  r  r   rW   rT   r2   r3   rV  r/   )"r   r   r  r   r   r  r   r  r  r  r  r  r   r  r   r   extended_attention_maskr   extended_molecule_attention_maskinput_char_embeddingsr  init_chars_encoder_outputsinput_char_encodinginit_molecule_encodingencoder_outputsmolecule_sequence_outputr  repeated_moleculesconcatr  final_chars_encoder_outputsdeep_encoder_hidden_statesdeep_encoder_self_attentionsrK  s"                                     r=   r   zCanineModel.forward<  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 #7BD$5b4%0%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!"[[EJJvVN 150P0PQ_al0m"&"A"Adkk.K.K #B #
 :>9Y9Y#j2I2O2OPR2S%T:
( &&y$++2O2OP	 !% 4 4%)'	 !5 !
 #LL".IM>
 &*%>%>!./!5	 &? &
" 9JJ  "&!8!89L!M ,,";/!5# ' 
 $31#5 AEAX$<=^b "334L^ijl^m3n /1CD"M //&1 '+&=&=2/!5	 '> '
# 6GGJU)F)F[jkl[m&!,::;,- .;;<  IT?+E+EZijlZm(#,778./ .889   %}5Fea(9;N'OaaaFM+-'+*	
 	
r<   )T)	NNNNNNNNN)r4   r5   r6   r   r  r  r8   r   r{   r  r  r   CANINE_INPUTS_DOCSTRINGformatr   _CHECKPOINT_FOR_DOCr/   _CONFIG_FOR_DOCr   r   r9   r7  r   r   r   r   r   s   @r=   r  r    sy   
<C6'ell '_b '"A5<< A%,, A[`[g[g A2 ++B+I+IJg+hi&0$ 156:59371559,0/3&*\
E,,-\
 !!2!23\
 !!1!12	\

 u//0\
 E--.\
   1 12\
 $D>\
 'tn\
 d^\
 
u22	3\
 j\
r<   r  z
    CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    c                       e Zd Z fdZ eej                  d             eee	e
      	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee   dee   dee   deee	f   fd              Z xZS )CanineForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   r   r   
num_labelsr  r  r   r   r   r   r  r   
classifierr  r   s     r=   r   z(CanineForSequenceClassification.__init__  i      ++!&)zz&"<"<=))F$6$68I8IJ 	r<   r  r  r   r  r   r   r  r   labelsr  r  r  r   c                 @   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|
s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   r  r   r  r  r  r   
regressionsingle_label_classificationmulti_label_classificationr   r`   losslogitsr2   r3   )r   r  r  r   r  problem_typer  r   r8   r   r{   r
   r  r	   r  r   r   r2   r3   )r   r   r  r   r   r  r   r  r  r  r  r5  r  r%  r$  loss_fctrK  s                    r=   r   z'CanineForSequenceClassification.forward  s   2 &1%<k$++B]B]++))%'/!5#  

  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r<   
NNNNNNNNNN)r4   r5   r6   r   r   r  r  r   r  r   r  r   r8   r   r9   r7  r   r   r   r   r   s   @r=   r  r    sB   	 ++B+I+IJg+hi&,$ 156:59371559-1,0/3&*E
E,,-E
 !!2!23E
 !!1!12	E

 u//0E
 E--.E
   1 12E
 ))*E
 $D>E
 'tnE
 d^E
 
u..	/E
 jE
r<   r  z
@add_start_docstrings(
    """
    CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
    softmax) e.g. for RocStories/SWAG tasks.
    """,
    CANINE_START_DOCSTRING,
)
class CanineForMultipleChoice(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.canine = CanineModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, MultipleChoiceModelOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
        inputs_embeds = (
            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
            if inputs_embeds is not None
            else None
        )

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
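# ---------------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original modeling file). Multiple-choice inputs are
# encoded once per candidate and then stacked so the model sees (batch_size, num_choices, seq_len).
# It assumes the public "google/canine-s" checkpoint, with a freshly initialized choice head.
def _example_multiple_choice():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
    model = CanineForMultipleChoice.from_pretrained("google/canine-s")

    prompt = "The dog wanted to go outside, so it"
    choices = ["scratched at the door.", "recompiled the kernel."]
    # Pair the prompt with every choice, then add the num_choices dimension expected by forward().
    encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
    inputs = {k: v.unsqueeze(0) for k, v in encoding.items()}  # each tensor: (1, num_choices, seq_len)

    with torch.no_grad():
        logits = model(**inputs).logits  # (1, num_choices)
    return logits.argmax(-1)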
@add_start_docstrings(
    """
    CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
    Named-Entity-Recognition (NER) tasks.
    """,
    CANINE_START_DOCSTRING,
)
class CanineForTokenClassification(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.canine = CanineModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, CanineForTokenClassification
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s")
        >>> model = CanineForTokenClassification.from_pretrained("google/canine-s")

        >>> inputs = tokenizer(
        ...     "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
        ... )

        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_token_class_ids = logits.argmax(-1)

        >>> # Note that tokens are classified rather than input words, which means that
        >>> # there might be more predicted token classes than words.
        >>> # Multiple token classes might account for the same word
        >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]]
        >>> predicted_tokens_classes  # doctest: +SKIP
        ```

        ```python
        >>> labels = predicted_token_class_ids
        >>> loss = model(**inputs, labels=labels).loss
        >>> round(loss.item(), 2)  # doctest: +SKIP
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@add_start_docstrings(
    """
    CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    CANINE_START_DOCSTRING,
)
class CanineForQuestionAnswering(CaninePreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.canine = CanineModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    @add_code_sample_docstrings(
        checkpoint="Splend1dchan/canine-c-squad",
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output="'nice puppet'",
        expected_loss=8.81,
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, QuestionAnsweringModelOutput]:
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
            are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.canine(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
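# ---------------------------------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original modeling file). It assumes the community
# "Splend1dchan/canine-c-squad" checkpoint referenced in the doc sample above is available. The
# answer span is read back by decoding the character ids between the argmax of the start and end
# logits, since CANINE tokenizes directly to Unicode code points.
def _example_question_answering():
    from transformers import AutoTokenizer

    checkpoint = "Splend1dchan/canine-c-squad"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = CanineForQuestionAnswering.from_pretrained(checkpoint)

    question = "Who wrote the play?"
    context = "The play was written by William Shakespeare."
    inputs = tokenizer(question, context, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
    start = int(outputs.start_logits.argmax())
    end = int(outputs.end_logits.argmax())
    # Decode the predicted character span back to text.
    return tokenizer.decode(inputs["input_ids"][0, start : end + 1])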