
from collections import UserDict
from typing import Union

import numpy as np
import requests

from ..utils import add_end_docstrings, logging
from .audio_classification import ffmpeg_read
from .base import Pipeline, build_pipeline_init_args


logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True))
class ZeroShotAudioClassificationPipeline(Pipeline):
    """
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    <Tip warning={true}>

    The default `hypothesis_template` is: `"This is a sound of {}."`. Make sure you update it for your usage.

    </Tip>

    Example:
    ```python
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
    ```


    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        if self.framework != "pt":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

    def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
        """
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing an http link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`List[str]`):
                The candidate labels for this audio. They will be formatted using *hypothesis_template*.
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}."`):
                The format used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                already formatted.
        Return:
            A list of dictionaries containing one entry per proposed label. Each dictionary contains the
            following keys:
            - **label** (`str`) -- One of the suggested *candidate_labels*.
            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                0 and 1, computed as the `softmax` of `logits_per_audio`.
        """
        return super().__call__(audios, **kwargs)

    def _sanitize_parameters(self, **kwargs):
        # Route call-time kwargs to `preprocess`; `_forward` and `postprocess` take no
        # extra parameters.
        preprocess_params = {}
        if "candidate_labels" in kwargs:
            preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
        if "hypothesis_template" in kwargs:
            preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
        return preprocess_params, {}, {}

    def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."):
        if isinstance(audio, str):
            if audio.startswith("http://") or audio.startswith("https://"):
                # Check for an explicit protocol so that a local file whose name looks
                # like a URL is still usable.
                audio = requests.get(audio).content
            else:
                with open(audio, "rb") as f:
                    audio = f.read()

        if isinstance(audio, bytes):
            # Decode raw audio bytes (downloaded or read from disk) with ffmpeg.
            audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate)

        if not isinstance(audio, np.ndarray):
            raise TypeError("We expect a numpy ndarray as input")
        if len(audio.shape) != 1:
            raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline")

        inputs = self.feature_extractor(
            [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
        )
        if self.framework == "pt":
            inputs = inputs.to(self.torch_dtype)
        inputs["candidate_labels"] = candidate_labels
        sequences = [hypothesis_template.format(x) for x in candidate_labels]
        text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True)
        inputs["text_inputs"] = [text_inputs]
        return inputs
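
    # Note: the dictionary `preprocess` returns above bundles the extracted audio
    # features with two extra keys, "candidate_labels" and "text_inputs", which
    # `_forward` pops off below before calling the model.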

    def _forward(self, model_inputs):
        candidate_labels = model_inputs.pop("candidate_labels")
        text_inputs = model_inputs.pop("text_inputs")
        if isinstance(text_inputs[0], UserDict):
            text_inputs = text_inputs[0]
        else:
            # Batching case.
            text_inputs = text_inputs[0][0]

        outputs = self.model(**text_inputs, **model_inputs)

        model_outputs = {
            "candidate_labels": candidate_labels,
            "logits": outputs.logits_per_audio,
        }
        return model_outputs

    def postprocess(self, model_outputs):
        candidate_labels = model_outputs.pop("candidate_labels")
        logits = model_outputs["logits"][0]

        if self.framework == "pt":
            # Softmax over the candidate labels turns the logits into probabilities.
            probs = logits.softmax(dim=0)
            scores = probs.tolist()
        else:
            raise ValueError("`tf` framework not supported.")

        result = [
            {"score": score, "label": candidate_label}
            for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
        ]
        return result
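

# A minimal usage sketch mirroring the doctest in the class docstring above. It assumes
# the `datasets` package is installed and that the `laion/clap-htsat-unfused` checkpoint
# can be downloaded from the Hub; exact scores depend on the checkpoint.
if __name__ == "__main__":
    from datasets import load_dataset

    from transformers import pipeline

    dataset = load_dataset("ashraq/esc50")
    audio = next(iter(dataset["train"]["audio"]))["array"]

    classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    predictions = classifier(
        audio,
        candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"],
        hypothesis_template="This is a sound of {}.",
    )
    # Each entry is {"score": <softmax probability>, "label": <candidate label>},
    # sorted by descending score.
    print(predictions)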