
    sgh                     V   d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
mZmZmZ ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&  e#       rddl'Z' e$       rddl(Z( e"       rdd	l)m*Z+  e%jX                  e-      Z.d
 Z/d Z0d Z1d Z2d Z3 G d de      Z4y)z Image processor class for OWLv2.    N)DictListOptionalTupleUnion   )BaseImageProcessorBatchFeatureget_size_dict)center_to_corners_formatpadto_channel_dimension_format)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_scipy_availableis_torch_availableis_vision_availableloggingrequires_backends)ndimagec                    | j                         r>| j                  t        j                  t        j                  fv r| S | j                         S | j                  t        j                  t        j                  fv r| S | j                         S )N)	is_floating_pointdtypetorchfloat32float64floatint32int64int)ts    c/var/www/html/venv/lib/python3.12/site-packages/transformers/models/owlv2/image_processing_owlv2.py_upcastr/   D   s`    GGu}}==qL1779LGGU[[99qFquuwF    c                 f    t        |       } | dddf   | dddf   z
  | dddf   | dddf   z
  z  S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.
    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r      )r/   )boxess    r.   box_arear5   M   sB     ENE!Q$K%1+%%1+ad*CDDr0   c                 ^   t        |       }t        |      }t        j                  | d d d d df   |d d d df         }t        j                  | d d d dd f   |d d dd f         }||z
  j	                  d      }|d d d d df   |d d d d df   z  }|d d d f   |z   |z
  }||z  }	|	|fS )Nr2   r   )minr3   )r5   r&   maxr7   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunionious
             r.   box_iourD   ]   s    VEVEyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29LAq!LAq$99E!T'NU"U*E
%-C:r0   c                 P   t        |      }t        |      }| j                  }|| j                  kD  r/|d|| j                  z
  z  z  }t	        j
                  | |      } | |fS || j                  dz
  k(  r|| j                  d   fz   }| |fS || j                  k  rt        d      | |fS )a%  Validate resize output shape according to input image.

    Args:
        image (`np.ndarray`):
         Image to be resized.
        output_shape (`iterable`):
            Size of the generated output image `(rows, cols[, ...][, dim])`. If `dim` is not provided, the number of
            channels is preserved.

    Returns
        image (`np.ndarray`):
            The input image, but with additional singleton dimensions appended in the case where `len(output_shape) >
            input.ndim`.
        output_shape (`Tuple`):
            The output shape converted to tuple.

    Raises ------ ValueError:
        If output_shape length is smaller than the image number of dimensions.

    Notes ----- The input image is reshaped if its number of dimensions is not equal to output_shape_length.

    )r3   r3   zIoutput_shape length cannot be smaller than the image number of dimensions)tuplelenshapendimnpreshape
ValueError)imageoutput_shapeoutput_ndiminput_shapes       r.   _preprocess_resize_output_shaperR   m   s    . &Ll#K++KUZZt{UZZ788

5+. , 


Q	&#u{{2&88 , 
uzz	!ghh,r0   c                 *   t        j                  |       }t        j                  |      r)t         j                  }t         j                  } ||       }n t         j                  }t         j
                  } ||       }t        j                  |||      }|S )a  Clip output image to range of values of input image.

    Note that this function modifies the values of *output_image* in-place.

    Taken from:
    https://github.com/scikit-image/scikit-image/blob/b4b521d6f0a105aabeaa31699949f78453ca3511/skimage/transform/_warps.py#L640.

    Args:
        input_image : ndarray
            Input image.
        output_image : ndarray
            Output image, which is modified in-place.
    )rK   r7   isnannanminnanmaxr8   clip)input_imageoutput_imagemin_valmin_funcmax_funcmax_vals         r.   _clip_warp_outputr^      sn     ff[!G	xx9999;'6666{#G77<':Lr0   c                       e Zd ZdZdgZdddddej                  dddf	dedee	e
f   ded	ed
eee	f   dededeee
ee
   f      deee
ee
   f      ddf fdZ	 	 ddej"                  deeeef      deeeef      fdZ	 	 	 	 ddej(                  d
eee	f   dedeeeef      deeeef      dej(                  fdZ e       dddddddddej.                  dfdeded	ed
eee	f   dede
dedeee
ee
   f      deee
ee
   f      deeeef      dedeeeef      dej6                  j6                  fd       Z	 d de
deeee   f   fdZd!dZ xZ S )"Owlv2ImageProcessorav  
    Constructs an OWLv2 image processor.

    Args:
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overriden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overriden by `rescale_factor` in the `preprocess`
            method.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to a square with gray pixels on the bottom and the right. Can be overriden by
            `do_pad` in the `preprocess` method.
        do_resize (`bool`, *optional*, defaults to `True`):
            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overriden
            by `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 960, "width": 960}`):
            Size to resize the image to. Can be overriden by `size` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling method to use if resizing the image. Can be overriden by `resample` in the `preprocess` method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
    pixel_valuesTgp?N
do_rescalerescale_factordo_pad	do_resizesizeresampledo_normalize
image_mean	image_stdreturnc
                     t        |   di |
 || _        || _        || _        || _        ||nddd| _        || _        || _        ||nt        | _
        |	|	| _        y t        | _        y )Ni  )heightwidth )super__init__rb   rc   rd   re   rf   rg   rh   r   ri   r   rj   )selfrb   rc   rd   re   rf   rg   rh   ri   rj   kwargs	__class__s              r.   rq   zOwlv2ImageProcessor.__init__   su     	"6"$," ,DS32O	 ((2(>*DT&/&;r0   rN   data_formatinput_data_formatc                 r    t        |      \  }}t        ||      }t        |d||z
  fd||z
  ffd||      }|S )ae  
        Pad an image to a square with gray pixels on the bottom and the right, as per the original OWLv2
        implementation.

        Args:
            image (`np.ndarray`):
                Image to pad.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.
        r   g      ?)rN   paddingconstant_valuesru   rv   )r   r8   r   )rr   rN   ru   rv   rm   rn   rf   s          r.   r   zOwlv2ImageProcessor.pad   sU    & 'u-65!'!TE\):;#/
 r0   anti_aliasingc                 (   t        | d       |d   |d   f}t        |t        j                        }t	        ||      \  }}|j
                  }	t        j                  |	|      }
d}d}d}|r|t        j                  d|
dz
  dz        }nt        j                  |      t        j                  |
      z  }t        j                  |dk        rt        d      t        j                  |dkD  |
dk  z        rt        j                  d	       t        j                   ||||
      }n|}|
D cg c]  }d|z  	 }}t        j"                  |||||d      }t%        ||      }t        ||t        j                        }|t        |||      }|S |}|S c c}w )a
  
        Resize an image as per the original implementation.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary containing the height and width to resize the image to.
            anti_aliasing (`bool`, *optional*, defaults to `True`):
                Whether to apply anti-aliasing when downsampling the image.
            anti_aliasing_sigma (`float`, *optional*, defaults to `None`):
                Standard deviation for Gaussian kernel when downsampling the image. If `None`, it will be calculated
                automatically.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred from the input
                image.
        scipyrm   rn   mirrorr   r3   r2   zFAnti-aliasing standard deviation must be greater than or equal to zerozWAnti-aliasing standard deviation greater than zero but not down-sampling along all axes)cvalmodeT)orderr   r~   	grid_mode)r!   r   r   LASTrR   rI   rK   dividemaximum
atleast_1d	ones_likeanyrM   warningswarnndigaussian_filterzoomr^   )rr   rN   rf   rz   anti_aliasing_sigmaru   rv   rs   rO   rQ   factorsndi_moder~   r   filteredfzoom_factorsouts                     r.   resizezOwlv2ImageProcessor.resize  s   : 	$(XW6+E3C3H3HI=e\R|kk))K6 "*&(jjWq[A4E&F#&(mm4G&H2<<X_K`&`#66-12$%pqqVV014AFGMMt **52EDW_`HH'./!A//hhxUPT`de!%-+E3DFVF[F[\R]Ri'{<MN 	  pu 	  0s   *Fimagesreturn_tensorsc           	         ||n| j                   }||n| j                  }||n| j                  }||n| j                  }||n| j                  }||n| j
                  }|	|	n| j                  }	||n| j                  }t        |      }t        |      }t        |      st        d      t        |||||	|       |D cg c]  }t        |       }}t        |d         r|rt        j!                  d       |t#        |d         }|r!|D cg c]  }| j%                  |||       }}|r |D cg c]  }| j'                  ||       }}|r!|D cg c]  }| j)                  |||       }}|r"|D cg c]  }| j+                  |||	|       }}|D cg c]  }t-        |||	       }}d
|i}t/        ||
      S c c}w c c}w c c}w c c}w c c}w c c}w )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to a square with gray pixels on the bottom and the right.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size to resize the image to.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rb   rc   rh   ri   rj   rf   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)rN   scalerv   )rN   rv   )rN   rf   rv   )rN   meanstdrv   )input_channel_dimra   )datatensor_type)rb   rc   rd   re   rh   ri   rj   rf   r   r   r   rM   r   r   r   loggerwarning_oncer   rescaler   r   	normalizer   r
   )rr   r   rd   re   rf   rb   rc   rh   ri   rj   r   ru   rv   rN   r   s                  r.   
preprocesszOwlv2ImageProcessor.preprocessO  sR   t $.#9Zt
+9+E4K^K^!-4;;!*!6IDNN	'3'?|TEVEV#-#9Zt
!*!6IDNN	'tTYYT"$V,F#:  	&!)%!	
 6<<E.'<<6!9%*s
 $ >vay I $ 5RcdF 
 ^deUZdhhU>OhPeFe $  &7  F   $ U^opF  ou
ej'{N_`
 
 '>BBU = f

s$   GG4GG9GG	thresholdtarget_sizesc                 N   |j                   |j                  }}|"t        |      t        |      k7  rt        d      t	        j
                  |d      }t	        j                  |j                        }|j                  }t        |      }|t        |t              rMt	        j                  |D 	cg c]  }	|	d   	 c}	      }
t	        j                  |D 	cg c]  }	|	d   	 c}	      }n|j                  d      \  }
}t	        j
                  |
|      }t	        j                  ||||gd      j                  |j                         }||dddddf   z  }g }t#        |||      D ]3  \  }}}|||kD     }|||kD     }|||kD     }|j%                  |||d       5 |S c c}	w c c}	w )a|  
        Converts the raw output of [`OwlViTForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`OwlViTObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model.
        NTMake sure that you pass in as many target sizes as the batch dimension of the logitsrF   dimr   r3   scoreslabelsr4   )logits
pred_boxesrH   rM   r&   r8   sigmoidvaluesindicesr   
isinstancer   Tensorunbindstacktodevicezipappend)rr   outputsr   r   r   r4   probsr   r   iimg_himg_wrf   	scale_fctresultsslbscorelabelboxs                        r.   post_process_object_detectionz1Owlv2ImageProcessor.post_process_object_detection  s   (  (:(:#6{c,// j  		&b)u||, )/ #,-L%Aqad%ABL%Aqad%AB+2215u 99UE*DT4t$<!DGGUIIaqj11E6651 	MGAq!a)m$Ea)m$EA	M"CNNeusKL		M ' &B%As   /FF"c                 j   |j                   |j                  }}|"t        |      t        |      k7  rt        d      ||j                  d   dk7  rt        d      t        j                  |d      }t        j                  |j                        }t        |      }|dk  rt        |j                  d	         D ]g  }	t        j                  ||	          D ]I  }
||	   |
   st        ||	   |
ddf   j                  d	      ||	         d	   d	   }d
||
<   d||	   ||kD  <   K i |t        |t              rMt        j                   |D 
cg c]  }
|
d	   	 c}
      }t        j                   |D 
cg c]  }
|
d   	 c}
      }n|j#                  d      \  }}t        j$                  ||||gd      j'                  |j(                        }||dddddf   z  }g }t        j*                  |      }t        |j                  d	         D ]  }	||	   }|j-                         j/                         s'd|||k  <   t        j                  |      dz   }||dz  z
  |dz  z  }t        j0                  |dd      }|||	<   ||	   d	kD  }||	   |   }||	   |   }|j3                  |d|d        |S c c}
w c c}
w )a  
        Converts the output of [`OwlViTForObjectDetection.image_guided_detection`] into the format expected by the COCO
        api.

        Args:
            outputs ([`OwlViTImageGuidedObjectDetectionOutput`]):
                Raw outputs of the model.
            threshold (`float`, *optional*, defaults to 0.0):
                Minimum confidence threshold to use to filter out predicted boxes.
            nms_threshold (`float`, *optional*, defaults to 0.3):
                IoU threshold for non-maximum suppression of overlapping boxes.
            target_sizes (`torch.Tensor`, *optional*):
                Tensor of shape (batch_size, 2) where each entry is the (height, width) of the corresponding image in
                the batch. If set, predicted normalized bounding boxes are rescaled to the target sizes. If left to
                None, predictions will not be unnormalized.

        Returns:
            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
            in the batch as predicted by the model. All labels are set to None as
            `OwlViTForObjectDetection.image_guided_detection` perform one-shot object detection.
        Nr   r3   r2   zTEach element of target_sizes must contain the size (h, w) of each image of the batchrF   r   g      ?r   g              gư>皙?g?r   )r   target_pred_boxesrH   rM   rI   r&   r8   r   r   r   rangeargsortrD   	unsqueezer   r   tensorr   r   r   r   
zeros_likenonzeronumelrW   r   )rr   r   r   nms_thresholdr   r   target_boxesr   r   idxr   iousr   r   r   r   alphasquery_scores	max_scorequery_alphasmask
box_scoresr4   s                          r.   #post_process_image_guided_detectionz7Owlv2ImageProcessor.post_process_image_guided_detection  s   ,  '~~w/H/H#Fs<7H(Hstt#(:(:1(=(Bstt		&b)u||, 0= 3\//23 <s|4 <A!#;q> "<#4QT#:#D#DQ#GVYIZ[\]^_`aD"DG8;F3K} 45<< #,-L%Aqad%ABL%Aqad%AB+2215uUE5%$@aHKKLL_L_`I')AtQJ*??L !!&)++A./ 	SC!#;L'')//1 69L	12 		,/$6I(IO<SQL ::lC=L&F3K#;?DT*J %d+ENNjD5QR'	S* A &B%As   	J+/J0)NN)TNNN)r   N)r   g333333?N)!__name__
__module____qualname____doc__model_input_namesr   BILINEARboolr   r,   r)   r   strr   r   rq   rK   arrayr   r   ndarrayr   r   FIRSTr   r   PILImager   r   r   r   __classcell__)rt   s   @r.   r`   r`      s   > ((  ,3#'9'B'B!:>9=QQ c5j)Q 	Q
 Q 38nQ %Q Q U5$u+#567Q E%e"456Q 
Q8 ?CDH	xx eC)9$9:; $E#/?*?$@A	F # >BDHAzzA 38nA 	A eC)9$9:;A $E#/?*?$@AA 
AF %& # $!:>9=;?(8(>(>DHCCCC CC 	CC
 38nCC CC CC CC U5$u+#567CC E%e"456CC !sJ!78CC &CC $E#/?*?$@ACC 
CC 'CCL _c9"'9=B:tTY{CZ=[9xQr0   r`   )5r   r   typingr   r   r   r   r   numpyrK   image_processing_utilsr	   r
   r   image_transformsr   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r    r!   r&   r   r|   r"   r   
get_loggerr   r   r/   r5   rD   rR   r^   r`   ro   r0   r.   <module>r      s    '  5 5  U U 
       $ 
		H	%GE  $N<p, pr0   