
    sgo                        d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
mZmZmZ ddlmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$  e#       rddl%Z% e$jL                  e'      Z(d	 Z)d
 Z*d Z+ G d de      Z,y)z Image processor class for OwlViT    N)DictListOptionalTupleUnion   )BaseImageProcessorBatchFeatureget_size_dict)center_cropcenter_to_corners_formatrescaleresizeto_channel_dimension_format)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatis_scaled_imagemake_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_torch_availableloggingc                    | j                         r>| j                  t        j                  t        j                  fv r| S | j                         S | j                  t        j                  t        j                  fv r| S | j                         S )N)	is_floating_pointdtypetorchfloat32float64floatint32int64int)ts    e/var/www/html/venv/lib/python3.12/site-packages/transformers/models/owlvit/image_processing_owlvit.py_upcastr,   5   s`    GGu}}==qL1779LGGU[[99qFquuwF    c                 f    t        |       } | dddf   | dddf   z
  | dddf   | dddf   z
  z  S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.
    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r      )r,   )boxess    r+   box_arear2   =   sB     ENE!Q$K%1+%%1+ad*CDDr-   c                 ^   t        |       }t        |      }t        j                  | d d d d df   |d d d df         }t        j                  | d d d dd f   |d d dd f         }||z
  j	                  d      }|d d d d df   |d d d d df   z  }|d d d f   |z   |z
  }||z  }	|	|fS )Nr/   r   )minr0   )r2   r#   maxr4   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunionious
             r+   box_iourA   L   s    VEVEyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29LAq!LAq$99E!T'NU"U*E
%-C:r-   c            !       
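
# Example (illustrative): pairwise IoU of two half-overlapping unit squares.
#
#     boxes_a = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
#     boxes_b = torch.tensor([[0.5, 0.0, 1.5, 1.0]])
#     iou, union = box_iou(boxes_a, boxes_b)
#     # intersection = 0.5 and union = 1.5, so iou == tensor([[0.3333]])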
    e Zd ZdZdgZddej                  dddddddf
 fd	Z	 	 d!dej                  d	e
eef   d
ej                  deeeef      deeeef      dej                  fdZ	 	 d!dej                  de
eef   deeeef      deeeef      dej                  f
dZ	 	 d!dej                  dedeeeef      deeeef      dej                  f
dZ e       dddddddddddej*                  dfdedee   d	ee
eef      d
edee   dee
eef      dee   dee   dee   deeeee   f      deeeee   f      deeeef      deeef   deeeef      defd       Zd Z	 d"dedeeee   f   fdZd#d Z xZ S )$OwlViTImageProcessora	  
    Constructs an OWL-ViT image processor.

    This image processor inherits from [`ImageProcessingMixin`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the shorter edge of the input to a certain `size`.
        size (`Dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for resizing the image. Only has an effect if `do_resize` is set to `True`. If `size` is a
            sequence like (h, w), output size will be matched to this. If `size` is an int, then image will be resized
            to (size, size).
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
            An optional resampling filter. This can be one of `PIL.Image.Resampling.NEAREST`,
            `PIL.Image.Resampling.BOX`, `PIL.Image.Resampling.BILINEAR`, `PIL.Image.Resampling.HAMMING`,
            `PIL.Image.Resampling.BICUBIC` or `PIL.Image.Resampling.LANCZOS`. Only has an effect if `do_resize` is set
            to `True`.
        do_center_crop (`bool`, *optional*, defaults to `False`):
            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge, the
            image is padded with 0's and then center cropped.
        crop_size (`Dict[str, int]`, *optional*, defaults to {"height": 768, "width": 768}):
            The size to use for center cropping the image. Only has an effect if `do_center_crop` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the input by a certain factor.
        rescale_factor (`float`, *optional*, defaults to `1/255`):
            The factor to use for rescaling the image. Only has an effect if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with `image_mean` and `image_std`.
        image_mean (`List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            The sequence of standard deviations for each channel, to be used when normalizing images.
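
    Example (an illustrative sketch; the output shape assumes the default 768x768 resize):

    ```python
    >>> import numpy as np
    >>> from transformers import OwlViTImageProcessor

    >>> image_processor = OwlViTImageProcessor()
    >>> image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)
    >>> inputs = image_processor(images=image, return_tensors="np")
    >>> inputs["pixel_values"].shape
    (1, 3, 768, 768)
    ```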
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize=True,
        size=None,
        resample=PILImageResampling.BICUBIC,
        do_center_crop=False,
        crop_size=None,
        do_rescale=True,
        rescale_factor=1 / 255,
        do_normalize=True,
        image_mean=None,
        image_std=None,
        **kwargs,
    ):
        size = size if size is not None else {"height": 768, "width": 768}
        size = get_size_dict(size, default_to_square=True)

        crop_size = crop_size if crop_size is not None else {"height": 768, "width": 768}
        crop_size = get_size_dict(crop_size, default_to_square=True)

        # Backwards compatibility: older configs stored the rescale flag under "rescale", which would
        # clash with the `rescale` method if it were set as an attribute in `super().__init__`.
        if "rescale" in kwargs:
            rescale_val = kwargs.pop("rescale")
            kwargs["do_rescale"] = rescale_val

        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_center_crop = do_center_crop
        self.crop_size = crop_size
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image to a certain size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                The size to resize the image to. Must contain height and width keys.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                The resampling filter to use when resizing the input.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        size = get_size_dict(size, default_to_square=True)
        if "height" not in size or "width" not in size:
            raise ValueError("size dictionary must contain height and width keys")

        return resize(
            image,
            (size["height"], size["width"]),
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def center_crop(
        self,
        image: np.ndarray,
        crop_size: Dict[str, int],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Center crop an image to a certain size.

        Args:
            image (`np.ndarray`):
                Image to center crop.
            crop_size (`Dict[str, int]`):
                The size to center crop the image to. Must contain height and width keys.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        crop_size = get_size_dict(crop_size, default_to_square=True)
        if "height" not in crop_size or "width" not in crop_size:
            raise ValueError("crop_size dictionary must contain height and width keys")

        return center_crop(
            image,
            (crop_size["height"], crop_size["width"]),
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.ndarray:
        """
        Rescale the image by the given factor. image = image * rescale_factor.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The value to use for rescaling.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
                one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        rc   )r   )rX   r\   rT   r]   r^   s        r+   r   zOwlViTImageProcessor.rescale   s    4 un+Yjkkr-   imagesrO   rR   rJ   rU   rV   rW   return_tensorsc                    ||n| j                   }||n| j                  }||n| j                  }||n| j                  }||n| j                  }||n| j
                  }||n| j                  }|	|	n| j                  }	|
|
n| j                  }
||n| j                  }t        |      }t        |      st        d      t        |||	|
||||||
       |D cg c]  }t        |       }}t        |d         r|rt         j#                  d       |t%        |d         }|r"|D cg c]  }| j'                  ||||       }}|r!|D cg c]  }| j)                  |||       }}|r!|D cg c]  }| j+                  |||       }}|	r"|D cg c]  }| j-                  ||
||       }}|D cg c]  }t/        |||	       }}t1        d
|i|      }|S c c}w c c}w c c}w c c}w c c}w c c}w )a  
        Prepares an image or batch of images for the model.

        Args:
            images (`ImageInput`):
                The image or batch of images to be prepared. Expects a single or batch of images with pixel values
                ranging from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether or not to resize the input. If `True`, will resize the input to the size specified by `size`.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                The size to resize the input to. Only has an effect if `do_resize` is set to `True`.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                The resampling filter to use when resizing the input. Only has an effect if `do_resize` is set to
                `True`.
            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
                Whether or not to center crop the input. If `True`, will center crop the input to the size specified by
                `crop_size`.
            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
                The size to center crop the input to. Only has an effect if `do_center_crop` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether or not to rescale the input. If `True`, will rescale the input by dividing it by
                `rescale_factor`.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                The factor to rescale the input by. Only has an effect if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether or not to normalize the input. If `True`, will normalize the input by subtracting `image_mean`
                and dividing by `image_std`.
            image_mean (`Union[float, List[float]]`, *optional*, defaults to `self.image_mean`):
                The mean to subtract from the input when normalizing. Only has an effect if `do_normalize` is set to
                `True`.
            image_std (`Union[float, List[float]]`, *optional*, defaults to `self.image_std`):
                The standard deviation to divide the input by when normalizing. Only has an effect if `do_normalize` is
                set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: defaults to the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)
rJ   rT   rU   rV   rW   rR   rS   rO   rP   rQ   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)rP   rQ   r^   )rS   r^   )rT   r^   )meanstdr^   )input_channel_dimrD   )datatensor_type)rO   rP   rQ   rR   rS   rJ   rT   rU   rV   rW   r   r   ra   r   r   r   loggerwarning_oncer   r   r   r   	normalizer   r
   )rX   re   rO   rP   rQ   rR   rS   rJ   rT   rU   rV   rW   rf   r]   r^   r\   encoded_inputss                    r+   
preprocesszOwlViTImageProcessor.preprocess  s|   L "+!6IDNN	'tTYY'38+9+E4K^K^!*!6IDNN	#-#9Zt
+9+E4K^K^'3'?|TEVEV#-#9Zt
!*!6IDNN	$V,F#: 
 	&!)%!)	
 6<<E.'<<6!9%*s
 $ >vay I $ ExSdeF 
 oufk  )O` aF   $ U>UfgF 
  $ u:9XijF  ou
ej'{N_`
 
 &NF+CQ_`O =


s$   G*G#G(1G-G26G7c           	         t        j                  dt               |j                  |j                  }}t        |      t        |      k7  rt        d      |j                  d   dk7  rt        d      t        j                  |d      }t        j                  |j                        }|j                  }t        |      }|j                  d      \  }}	t        j                  |	||	|gd      j!                  |j"                        }
||
dddddf   z  }t%        |||      D cg c]  \  }}}|||d	 }}}}|S c c}}}w )
a=  
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                image size (before any data augmentation). For visualization, this should be the image size after data
                augmentation, but before padding.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        warnings.warn(
            "`post_process` is deprecated and will be removed in v5 of Transformers, please use"
            " `post_process_object_detection` instead, with `threshold=0.` for equivalent results.",
            FutureWarning,
        )

        logits, boxes = outputs.logits, outputs.pred_boxes

        if len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        if target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert to (x0, y0, x1, y1) format
        boxes = center_to_corners_format(boxes)

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
        boxes = boxes * scale_fct[:, None, :]

        results = [{"scores": s, "labels": l, "boxes": b} for s, l, b in zip(scores, labels, boxes)]

        return results

    def post_process_object_detection(
        self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None
    ):
        """
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
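
        Example (an illustrative sketch, not a runnable doctest: `outputs` is assumed to come from a
        forward pass of [`OwlViTForObjectDetection`], and the threshold value is arbitrary):

        ```python
        >>> target_sizes = torch.tensor([[480, 640]])  # original (height, width) of each image
        >>> results = image_processor.post_process_object_detection(
        ...     outputs, threshold=0.1, target_sizes=target_sizes
        ... )
        >>> boxes = results[0]["boxes"]  # absolute (x0, y0, x1, y1) boxes above the score threshold
        ```
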
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        """
        logits, boxes = outputs.logits, outputs.pred_boxes

        if target_sizes is not None and len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")

        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)
        labels = probs.indices

        # Convert to (x0, y0, x1, y1) format
        boxes = center_to_corners_format(boxes)

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
            if isinstance(target_sizes, List):
                img_h = torch.Tensor([i[0] for i in target_sizes])
                img_w = torch.Tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)

            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
            boxes = boxes * scale_fct[:, None, :]

        results = []
        for s, l, b in zip(scores, labels, boxes):
            score = s[s > threshold]
            label = l[s > threshold]
            box = b[s > threshold]
            results.append({"scores": score, "labels": label, "boxes": box})

        return results

    def post_process_image_guided_detection(self, outputs, threshold=0.0, nms_threshold=0.3, target_sizes=None):
        """
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.

        Args:
            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.0):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
                None, predictions will not be unnormalized.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model. All labels are set to None as
            `OwlViTForObjectDetection.image_guided_detection` performs one-shot object detection.
        """
        logits, target_boxes = outputs.logits, outputs.target_pred_boxes

        if target_sizes is not None and len(logits) != len(target_sizes):
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")
        if target_sizes is not None and target_sizes.shape[1] != 2:
            raise ValueError("Each element of target_sizes must contain the size (h, w) of each image of the batch")

        probs = torch.max(logits, dim=-1)
        scores = torch.sigmoid(probs.values)

        # Convert to (x0, y0, x1, y1) format
        target_boxes = center_to_corners_format(target_boxes)

        # Apply non-maximum suppression (NMS)
        if nms_threshold < 1.0:
            for idx in range(target_boxes.shape[0]):
                for i in torch.argsort(-scores[idx]):
                    if not scores[idx][i]:
                        continue

                    ious = box_iou(target_boxes[idx][i, :].unsqueeze(0), target_boxes[idx])[0][0]
                    ious[i] = -1.0  # Mask self-IoU.
                    scores[idx][ious > nms_threshold] = 0.0

        # Convert from relative [0, 1] to absolute [0, height] coordinates
        if target_sizes is not None:
            if isinstance(target_sizes, List):
                img_h = torch.tensor([i[0] for i in target_sizes])
                img_w = torch.tensor([i[1] for i in target_sizes])
            else:
                img_h, img_w = target_sizes.unbind(1)
            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(target_boxes.device)
            target_boxes = target_boxes * scale_fct[:, None, :]

        # Compute box display alphas based on prediction scores
        results = []
        alphas = torch.zeros_like(scores)

        for idx in range(target_boxes.shape[0]):
            # Select scores for boxes matching the current query
            query_scores = scores[idx]
            if not query_scores.nonzero().numel():
                continue

            # Apply the score threshold before scaling
            query_scores[query_scores < threshold] = 0.0

            # Scale box alphas such that the best box for each query has alpha 1.0 and the worst kept box
            # has alpha 0.1. All other boxes either belong to a different query or are not shown.
            max_score = torch.max(query_scores) + 1e-6
            query_alphas = (query_scores - (max_score * 0.1)) / (max_score * 0.9)
            query_alphas = torch.clip(query_alphas, 0.0, 1.0)
            alphas[idx] = query_alphas

            mask = alphas[idx] > 0
            box_scores = alphas[idx][mask]
            boxes = target_boxes[idx][mask]
            results.append({"scores": box_scores, "labels": None, "boxes": boxes})

        return results
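

# Illustrative end-to-end usage (a sketch in comments, not executed on import; the checkpoint
# name and threshold are examples only):
#
#     from transformers import OwlViTForObjectDetection, OwlViTProcessor
#
#     processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
#     model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
#     inputs = processor(text=[["a photo of a cat"]], images=image, return_tensors="pt")
#     outputs = model(**inputs)
#     target_sizes = torch.tensor([image.size[::-1]])  # PIL `size` is (width, height)
#     results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)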