
    sgj                        d Z ddlZddlmZmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZ ddlmZ ddlm Z   ejB                  e"      Z#dZ$dZ%dZ&g dZ'dZ(dZ) G d dejT                        Z+ G d dejT                        Z, G d dejT                        Z- G d dejT                        Z. G d dejT                        Z/ G d d ejT                        Z0 G d! d"ejT                        Z1 G d# d$ejT                        Z2 G d% d&ejT                        Z3 G d' d(e      Z4d)Z5d*Z6 G d+ d,e4      Z7 ed-e5       G d. d/e4             Z8 ed0e5       G d1 d2e4             Z9y)3zPyTorch M-CTC-T model.    N)OptionalTupleUnion)nn   )ACT2FN)add_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)BaseModelOutputCausalLMOutput)PreTrainedModelapply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)logging   )MCTCTConfigr   zspeechbrain/m-ctc-t-large)r      i   zY"Mr. Quilter is the apostle of the middle classes, and we're glad to welcome his gospel."gv@c                   (     e Zd ZdZ fdZd Z xZS )MCTCTConv1dSubsamplerz
    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
    via gated linear units (https://arxiv.org/abs/1911.08460)
    c                 P    t                    | _        |j                   _        t        j                  |j                         _        |j                   _
        |j                  |j                  z   _         j                  dkD  r)|j                  t        d      |j                   _        nd  _        |j"                  dz   _        |j&                   _        |j*                   _        t        j.                   fdt1         j(                        D               _        y )Nr   zbNeed to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution layers.   c              3     K   | ]w  \  }}t        j                  |d k(  rj                  nj                  |   |j                  dz
  k  rj                  |   nj
                  |j                  |   d       y yw)r   r   valid)kernel_sizestridepaddingN)r   Conv1din_channelsmid_channels
num_layersout_channelsr    ).0ikselfs      f/var/www/html/venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py	<genexpr>z1MCTCTConv1dSubsampler.__init__.<locals>.<genexpr>Y   s      	)
 1 II$%F  0A0A!0D()DOOa,?(?!!!$TEVEV{{1~ 	)
s   A=B )super__init__configconv_glu_dimglu_dimr   Dropoutconv_dropoutdropoutnum_conv_layersr%   input_feat_per_channelinput_channelsr#   conv_channels
ValueErrorr$   hidden_sizer&   conv_kernelr   conv_strider    
ModuleList	enumerateconv_layersr*   r/   	__class__s   ` r+   r.   zMCTCTConv1dSubsampler.__init__=   s    **zz&"5"56 00!886;P;PP??Q##+  
 !' 4 4D $D"..2!--((
 == 	)
 "$"2"23	)
 	
    c                    t        | j                  D cg c]  }|dz  	 c}      }t        j                  j                  j                  |dd||fdd      }|j                  dd      j                         }| j                  D ]F  } ||      }t        j                  j                  || j                        }| j                  |      }H |j                  dd      j                         }|S c c}w )Nr   r   constantr   dim)sumr   torchr   
functionalpad	transpose
contiguousr?   glur1   r4   )r*   input_featuressizer!   hidden_statesconvs         r+   forwardzMCTCTConv1dSubsampler.forwardd   s     T-=-=>Ttqy>?,,00!QQXAY[eghi&00A6AAC$$ 	8D /MMM--m-NM LL7M	8
 &//15@@B ?s   C.__name__
__module____qualname____doc__r.   rR   __classcell__rA   s   @r+   r   r   7   s    
%
NrB   r   c                   ,     e Zd ZdZ fdZ	 ddZ xZS )MCTCTEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t               | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                  | j,                  j2                        d       y )N)padding_idxposition_ids)r   F)
persistenttoken_type_idsdtypedevice)r-   r.   r   	Embedding
vocab_sizer:   pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsMCTCTLayerNorm	LayerNormr2   hidden_dropout_probr4   register_bufferrH   arangeexpandzerosr^   rO   longrd   r@   s     r+   r.   zMCTCTEmbeddings.__init__w   s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"
 ()zz&"<"<= 	ELL)G)GHOOPWXej 	 	
 	KK))..0

4K\K\KcKcd 	 	
rB   c                    ||j                         n|j                         d d }|d   }|| j                  d d |||z   f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }	|	}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  |      }| j                  |      }|S )Nr_   r   ra   r   rb   )rO   r^   hasattrra   rr   rH   rs   rt   rd   rh   rl   rn   r4   )r*   rN   ra   r^   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrl   
embeddingss               r+   rR   zMCTCTEmbeddings.forward   s    0>/In))+}OaOaOcdgegOh ^
,,Q0FVlIl0l-lmL
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00@M $ : :> J"%::
^^J/
\\*-
rB   )NNNNr   rS   rY   s   @r+   r[   r[   t   s    Q
. wxrB   r[   c                   >     e Zd Z fdZd Zd Zd Z	 	 	 ddZ xZS )MCTCTSelfAttentionc                 Z   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        |j                  | _        | j                  | j                  z  | _        t        j                  |j                  | j                  d      | _        t        j                  |j                  | j                  d      | _        t        j                  |j                  | j                  d      | _        t        j                  |j                        | _        |j"                  | _        t        j$                  d|j"                  z  d	z
  | j                        | _        |j(                  | _        y )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()Fbiasr   r   )r-   r.   r:   num_attention_headsrv   r9   attention_head_dimattention_head_sizeall_head_sizer   Linearquerykeyvaluer2   attention_probs_dropout_probr4   ri   re   distance_embedding
is_decoderr@   s     r+   r.   zMCTCTSelfAttention.__init__   sX    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #)#<#< !558P8PPYYv1143E3EER
99V//1C1C%PYYv1143E3EER
zz&"E"EF'-'E'E$"$,,q63Q3Q/QTU/UW[WoWo"p ++rB   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr_   r   r   r      )rO   r   r   viewpermute)r*   xnew_x_shapes      r+   transpose_for_scoresz'MCTCTSelfAttention.transpose_for_scores   sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$rB   c           	         t        |j                        dkD  r4 |j                  t        t	        t        |j                                     }  |j
                  t        |       j                  t        t	        t        |                   S )Nr   )lenshaper   reversedrangereshape)r*   r   r   s      r+   reshape_fortranz"MCTCTSelfAttention.reshape_fortran   sd    qww<!		8E#agg,$789A2yqyy(5/*22HU3u:=N4OPPrB   c           	         |j                  dddd      }|j                  \  }}}}t        j                  |t        j                  ||||f|j
                        fd      }| j                  ||||z   |z  d|g      }|d d d ||z   dz
  |z  f   }| j                  ||||z   dz
  ||g      }|dz  }|d d |||z   f   j                  dd      }|j                  dddd      S )Nr   r   r   r   rd   rE   )r   r   rH   catrs   rd   r   rK   )r*   scoresbatchhidden_stateseq_lenheads	halfpoints          r+   "relative_position_embedding_rotatez5MCTCTSelfAttention.relative_position_embedding_rotate   s    1a+.4ll+|We FEKK%0PY_YfYf$ghnop %%fu|g7MQX6XZ[]b.cd Cg4q8GCCCD %%fulW6Lq6PRY[`.ab A%	9y7':::;EEaK~~aAq))rB   c                 
   | j                  |      }|t        j                  | j                        z  }| j	                  | j                  |            }| j	                  | j                  |            }| j	                  |      }t        j                  ||j                  dd            }	| j                  j                  }
t        j                  d|
|j                  dd            }| j                  |      }|	|z   }	||	|z   }	t        j                  j!                  |	d      }| j#                  |      }|||z  }t        j                  ||      }|j%                  dddd      j'                  d	      }|r||f}|S |f}|S )
Nr_   zlh, bche -> bcler   r   rE   r   r   )	start_dim)r   mathsqrtr   r   r   r   rH   matmulrK   r   weighteinsumr   r   rI   softmaxr4   r   flatten)r*   rP   attention_mask	head_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerattention_scorespositional_embeddingrelative_position_scoresattention_probscontext_layeroutputss                  r+   rR   zMCTCTSelfAttention.forward   s    !JJ}5-		$:R:R0SS--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ  $66==#(<<0BDXZeZoZopqstZu#v #'#J#JKc#d +.FF%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9AABAO6G=/2 O\M]rB   NNF)	rT   rU   rV   r.   r   r   r   rR   rX   rY   s   @r+   r   r      s(    ,.%
Q
*8 .rB   r   c                   $     e Zd Z fdZd Z xZS )rm   c                     t         |           t        j                  t	        j
                  d            | _        t        j                  t	        j                  d            | _        y Nr   )	r-   r.   r   	ParameterrH   onessingleton_weightrs   singleton_bias)r*   rA   s    r+   r.   zMCTCTLayerNorm.__init__  s@     "UZZ] ; ll5;;q>:rB   c                 :    || j                   z  | j                  z   S N)r   r   r*   rP   s     r+   rR   zMCTCTLayerNorm.forward   s     5 559L9LLLrB   rT   rU   rV   r.   rR   rX   rY   s   @r+   rm   rm     s    ;
MrB   rm   c                   $     e Zd Z fdZd Z xZS )MCTCTSelfOutputc                 :   t         |           || _        t        j                  |j
                  |j
                  d      | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y NFr   )eps)r-   r.   r/   r   r   r:   densern   layer_norm_epsr2   ro   r4   r@   s     r+   r.   zMCTCTSelfOutput.__init__%  si    YYv1163E3EER
f&8&8f>S>STzz&"<"<=rB   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r4   rn   r*   rP   input_tensors      r+   rR   zMCTCTSelfOutput.forward,  7    

=1]3}|'CDrB   r   rY   s   @r+   r   r   $  s    >rB   r   c                   2     e Zd Z fdZd Z	 	 	 ddZ xZS )MCTCTAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )r-   r.   r   r*   r   outputsetpruned_headsr@   s     r+   r.   zMCTCTAttention.__init__4  s0    &v.	%f-ErB   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   rE   )r   r   r*   r   r   r   r   r   r   r   r   r   r   union)r*   r   indexs      r+   prune_headszMCTCTAttention.prune_heads:  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:rB   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r*   r   )r*   rP   r   r   r   self_outputsattention_outputr   s           r+   rR   zMCTCTAttention.forwardL  sN     yy	
  ;;|AF#%QR(88rB   r   )rT   rU   rV   r.   r   rR   rX   rY   s   @r+   r   r   3  s    ";* rB   r   c                   $     e Zd Z fdZd Z xZS )MCTCTIntermediatec                    t         |           t        j                  |j                  |j
                  d      | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y )NFr   )r-   r.   r   r   r:   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr@   s     r+   r.   zMCTCTIntermediate.__init__`  s`    YYv1163K3KRWX
f''-'-f.?.?'@D$'-'8'8D$rB   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r   s     r+   rR   zMCTCTIntermediate.forwardh  s&    

=100?rB   r   rY   s   @r+   r   r   _  s    9rB   r   c                   $     e Zd Z fdZd Z xZS )MCTCTOutputc                 ,   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r-   r.   r   r   r   r:   r   rn   r   r2   ro   r4   r@   s     r+   r.   zMCTCTOutput.__init__o  sc    YYv779K9KRWX
f&8&8f>S>STzz&"<"<=rB   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r+   rR   zMCTCTOutput.forwardu  r   rB   r   rY   s   @r+   r   r   n  s    >rB   r   c                   8     e Zd Zdef fdZ	 	 	 ddZd Z xZS )
MCTCTLayerr/   c                     t         |           d| _        |j                  | _        t	        |      | _        t        |      | _        |j                  | _        t        |      | _
        y r   )r-   r.   seq_len_dimchunk_size_feed_forwardr   intermediater   	attentionr   r   r   r@   s     r+   r.   zMCTCTLayer.__init__}  sV    '-'E'E$-f5'/ ++!&)rB   c                     | j                  ||||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r   r   r   )r   r   feed_forward_chunkr   r   )	r*   rP   r   r   r   self_attention_outputsr   r   layer_outputs	            r+   rR   zMCTCTLayer.forward  st     "&>9HY "0 "
 2!4(,0##T%A%A4CSCSUe
  /G+rB   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r*   r   intermediate_outputr   s       r+   r   zMCTCTLayer.feed_forward_chunk  s,    "//0@A{{#68HIrB   r   )rT   rU   rV   r   r.   rR   r   rX   rY   s   @r+   r   r   |  s$    	*{ 	* *rB   r   c                   L    e Zd ZdZeZdZdZdZd Z	de
j                  fdZd Zy	)
MCTCTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    mctctrN   Tc                    | j                   j                  }t        |t        j                        rZ|j
                  j                  j                  d|       |j                  b|j                  j                  j                          n<t        |t        j                        re|j
                  j                  j                  d|       |j                  |j
                  j                  |j                     j                          nt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       nYt        |t              rI|j                  j                  j                  d       |j                   j                  j                          t        |t        j                  t        j"                  f      rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          yyy)zInitialize the weightsg        )meanstdNg      ?)r/   initializer_ranger   r   r   r   datanormal_r   zero_re   r]   rn   fill_rm   r   r   r"   )r*   moduler  s      r+   _init_weightsz"MCTCTPreTrainedModel._init_weights  s   kk++fbii( MM&&CS&9{{&  &&(-MM&&CS&9!!-""6#5#56<<>-KK""$MM$$S)/##((..s3!!&&,,.fryy"))45MM&&CS&9{{&  &&( ' 6rB   input_lengthsc                 (   d}t        t        | j                  j                        | j                  j                  | j                  j
                        D ]:  \  }}}|dz  }|d|z  z   ||dz
  z  z
  dz
  }t        j                  ||d      dz   }< |S )zH
        Computes the output length of the convolutional layers
        r   r   trunc)rounding_mode)zipr   r/   r5   r;   r<   rH   div)r*   r  dilation_	kernel_szr    r!   s          r+    _get_feat_extract_output_lengthsz5MCTCTPreTrainedModel._get_feat_extract_output_lengths  s     $'$++--.0G0GI`I`%
 	X Ay&  1nG)AK7(iRSm:TTWXXM!IImV7SVWWM	X rB   c                    t        |j                        dkD  r|d d d d df   }| j                  |j                  d            }|j	                         d   }t        j                  ||f|j                  |j                        }d|t        j                  ||j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr   r_   r   rb   r   r   )r   r   r  rG   rO   rH   rs   rc   rd   rq   flipcumsumrt   )r*   feature_vector_lengthr   subsampled_lengthsbszs        r+   "_get_feature_vector_attention_maskz7MCTCTPreTrainedModel._get_feature_vector_attention_mask  s     ~##$q(+Aq"H5N "BB>CUCUVXCYZ!!#A&'(0D0D^MbMb
 efS1F1FGI[^_I_`a',,bT299"=BBB4HMMOrB   N)rT   rU   rV   rW   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr
  rH   
LongTensorr  r   rB   r+   r   r     s;    
 L&O&*#)0e>N>N rB   r   aH  
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_features (`torch.LongTensor` of shape `({0})`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
c                        e Zd Zdef fdZ	 	 	 ddej                  dej                  dej                  dededed	ee	e
f   fd
Z xZS )MCTCTEncoderr/   c                 $   t         |   |       |j                  | _        t               | _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _        y c c}w )NF)r-   r.   ro   rm   
layer_normr   rQ   r   r=   r   num_hidden_layersr   layersgradient_checkpointing)r*   r/   r  rA   s      r+   r.   zMCTCTEncoder.__init__  sm     #)#=#= (*)&1	mmvG_G_A`$aAZ%7$ab&+# %bs   'BrN   r   r   r   output_hidden_statesreturn_dictreturnc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }| j                  |      }|| j                  |j                  d   |      }t        j                  j                  || j                  | j                        }|t        ||j                        }|rdnd }	|rdnd }
|_|j                         d   t!        | j"                        k7  r6t%        dt!        | j"                         d|j                         d    d      t'               xs t)        |       }t+        | j"                        D ]  \  }}|r|	|fz   }	t-        j.                  g       }| j                  r|| j                   j0                  k  rdnd	}|r|rO| j2                  r3| j                  r'| j5                  |j6                  |||||   nd |      }n ||||
      }|d   }|rd}|s|
d   fz   }
 |r|	|fz   }	|st9        d ||	|
fD              S t;        ||	|
      S )Nr   )ptrainingr!  r   z&The head_mask should be specified for z layers, but it is for .TF)rP   r   r   )NNc              3   &   K   | ]	  }||  y wr   r!  )r'   vs     r+   r,   z'MCTCTEncoder.forward.<locals>.<genexpr>m  s     eqWXWdes   last_hidden_staterP   
attentions)r/   r   r)  use_return_dictr%  rQ   r  r   r   rI   r4   ro   r.  r   rc   rO   r   r'  r9   r   r   r>   rH   rand	layerdropr(  _gradient_checkpointing_func__call__tupler   )r*   rN   r   r   r   r)  r*  rw   rP   encoder_statesall_attentionssynced_gpusidxencoder_layerdropout_probabilityskip_the_layerlayer_outputss                    r+   rR   zMCTCTEncoder.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]8		.1 %!DD]EXEXYZE[]klN--mt?W?Wbfbobo-p %7H[H[\N30d  ~~"c$++&66 <S=M<N O%%.^^%5a%8$9< 
 12R6LT6R"+DKK"8 	FC#!/=2B!B #(**R.%)]]8KdkkNcNc8cTjoN![..4==$($E$E%..%&+4+@3d)%M %2&3'5*;%M !.a 0 , !/=3C2E!E?	FB  +}.>>Ne]NN$Seee+>Vd
 	
rB   )FFT)rT   rU   rV   r   r.   rH   Tensorboolr   r   r   rR   rX   rY   s   @r+   r#  r#    s    ,{ , #(%* R
R
 R
 <<	R

  R
 #R
 R
 
uo%	&R
rB   r#  zaThe bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.c                       e Zd Z fdZ eej                  d             eee	e
de      	 	 	 	 	 ddej                  deej                     deej                     dee   d	ee   d
ee   deee	f   fd              Z xZS )
MCTCTModelc                 r    t         |   |       || _        t        |      | _        | j                          y r   )r-   r.   r/   r#  encoder	post_initr@   s     r+   r.   zMCTCTModel.__init__x  s/     #F+ 	rB   zbatch_size, sequence_lengthaudio)
checkpointoutput_typer  modalityexpected_outputrN   r   r   r   r)  r*  r+  c                 J   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||||||      }|d   }|s	|f|dd  z   S t        ||j                  |j                        S )Nz#You have to specify input_features.r   r   r   r)  r*  r   r   r2  )	r/   r   r)  r5  r9   rH  r   rP   r4  )	r*   rN   r   r   r   r)  r*  encoder_outputssequence_outputs	            r+   rR   zMCTCTModel.forward  s    " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!BCC,,)/!5# ' 
 *!,#%(;;;-)77&11
 	
rB   )NNNNN)rT   rU   rV   r.   r   MCTCT_INPUTS_DOCSTRINGformatr	   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPErH   rC  r   rD  r   r   rR   rX   rY   s   @r+   rF  rF  s  s    
 ++A+H+HIf+gh&#$. 26,0,0/3&*#
#
 !.#
 ELL)	#

 $D>#
 'tn#
 d^#
 
uo%	&#
 i#
rB   rF  zcMCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).c                       e Zd Z fdZ ee       eeee	e
e      	 	 	 	 	 	 ddej                  deej                     deej                     dee   dee   dee   d	eej                      d
eeef   fd              Z xZS )MCTCTForCTCc                    t         |   |       t        |      | _        |j                  t        d| j                   d      |j                  }t        j                  ||j                        | _
        | j                          y )NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)r-   r.   rF  r   rf   r9   rA   r:   r   r   ctc_headrI  )r*   r/   output_hidden_sizerA   s      r+   r.   zMCTCTForCTC.__init__  s     '
$00@ AH H  $//		"4f6G6GH 	rB   )rK  rL  r  rN  expected_lossrN   r   r   r   r)  r*  labelsr+  c           
         |I|j                         | j                  j                  k\  r"t        d| j                  j                         ||n| j                  j                  }| j                  ||||||      }|d   }	| j                  |	      }
d}|o||n1t        j                  |j                  dd t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |
dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                  j0                  | j                  j2                  | j                  j4                        }ddd       |s|
f|t6        d z   }||f|z   S |S t9        ||
|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: rP  r   r_   )rc   )rF   rc   r   F)enabled)blank	reductionzero_infinity)losslogitsrP   r4  )maxr/   rf   r9   r5  r   r[  rH   r   r   rt   r  rG   tomasked_selectr   rI   log_softmaxfloat32rK   backendscudnnflagsctc_lossrg   ctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rP   r4  )r*   rN   r   r   r   r)  r*  r^  r   rP   re  rd  r  labels_masktarget_lengthsflattened_targets	log_probsr   s                     r+   rR   zMCTCTForCTC.forward  s)   2 &**,$++2H2H"HCDKKDZDZC[\]]%0%<k$++B]B]**)/!5#  
  
}- "- ZZ 4 4Sb 9L 
 !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#II)NNNNNN)rT   rU   rV   r.   r   rS  r	   rU  r   rV  _CTC_EXPECTED_OUTPUT_CTC_EXPECTED_LOSSrH   rC  r   rD  r   r   r   rR   rX   rY   s   @r+   rY  rY    s    
& ++AB&"$,( 26,0,0/3&*-1E
E
 !.E
 ELL)	E

 $D>E
 'tnE
 d^E
 ))*E
 
un$	%E
 CE
rB   rY  ):rW   r   typingr   r   r   rH   torch.utils.checkpointr   activationsr   
file_utilsr	   r
   r   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   r   r   r   utilsr   configuration_mctctr   
get_loggerrT   loggerrq  rV  rU  rW  rv  rw  Moduler   r[   r   rm   r   r   r   r   r   r   MCTCT_START_DOCSTRINGrS  r#  rF  rY  r!  rB   r+   <module>r     s     ) )    " r r A 8 C @   , 
		H	% !  2 '  t  :BII :z7bii 7ti iXMRYY Mbii )RYY )X		 ")) $ $NB? BJ	  @]
' ]
@ g5
% 5
	5
p ma
& a
	a
rB   