
    sg?                    
   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlmZ ddlmZ  e ej                   dd	            Zd
 Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd Zd ZdjdZd Zd Z d Z!d Z"dddZ#	 	 	 	 	 	 dkdZ$	 	 	 	 	 	 	 dldZ% G d d       Z& ee!      d"        Z'	 dmd#Z(dnd$Z)dddd%ddd&d'ejT                  d(ejT                  d)ejT                  d*eejT                     d+e+d,eeee   ee   ee   f      d-ee,   fd.Z-dddd%ddd&d'ejT                  d(ejT                  d)ejT                  d*eejT                     d+e+d,eeee   ee   ee   f      d-ee,   fd/Z. e       rd dl/Z/d dl0m1Z2 e/jf                  d0e2jh                  d1e2jh                  d2e2jh                  d3e2jh                  d4e2jh                  d5e2jh                  fd6       Z5e/jf                  d1e2jh                  d2e2jh                  d4e2jh                  d5e2jh                  d7e2jh                  f
d8       Z6d9 Z7d:d:dd%dd;d'ejT                  d<ejT                  d=ejT                  d*eejT                     d+e+d,eeee   ee   ee   f      fd>Z8dd%ddd?d(ejT                  d)ejT                  d*eejT                     d+e+d,eeee   ee   ee   f      d-ee,   fd@Z9e/jf                  dAe2jh                  dBe2jh                  fdC       Z:djdDZ;	 	 	 dodEejT                  dFejT                  dGejT                  dHeejT                     dIe<dJe+dKee<   fdLZ=e/jf                  dMe2jh                  dNe2jh                  dOe2jh                  dPe2jh                  dQe2jh                  dRe2jh                  d5e2jh                  fdS       Z>dTejT                  dUejT                  dVejT                  dWejT                  dXejT                  f
dYZ?e/jf                  dZe2jh                  d[e2jh                  dPe2jh                  d\e2jh                  dQe2jh                  dRe2jh                  d]e2jh                  d5e2jh                  fd^       Z@	 dpdTejT                  dUejT                  d_ejT                  d`ejT                  daejT                  dbejT                  d-e,dXejT                  dce+fddZAe/jf                  dee2jh                  dfe2jh                  dge2jh                  d1e2jh                  d2e2jh                  dhe2jh                  d4e2jh                  d5e2jh                  d7e2jh                  d\e2jh                  fdi       ZBydZ;dZ9dZ8dZ=dZ?dZAdZBy)q    N)	lru_cache)OptionalTuple)	warn_once)
has_triton   )get_meta*TORCH_SPARSE_BSR_SCATTER_MM_LRU_CACHE_SIZE   c                     | st        |      y N)
ValueError)condmsgs     K/var/www/html/venv/lib/python3.12/site-packages/torch/sparse/_triton_ops.pycheckr      s    o     c                 X    t        |j                  t        j                  k(  |  d       y )Nz@(): only BSR sparse format is supported for the sparse argument.)r   layouttorch
sparse_bsr)f_namets     r   check_bsr_layoutr      s'    		E$$$(RSr   c                 r    t        |j                  |k(  xr |j                  j                  dk(  |  d       y )Ncudaz9(): all inputs are expected to be on the same GPU device.)r   devicetype)r   r   r   s      r   check_devicer   !   s3    		F6qxx}}6(KLr   c           	      *   t        |j                         dk\  xr |j                         dk\  |  d|j                          d|j                          d       |j                  dd  \  }}|j                  dd  \  }}t        ||k(  |  d| d| d       y )Nr   zc(): all inputs involved in the matrix product are expected to be at least 2D, but got lhs.dim() == z and rhs.dim() == .zw(): arguments' sizes involved in the matrix product are not compatible for matrix multiplication, got lhs.shape[-1] == z( which is not equal to rhs.shape[-2] == )r   dimshape)r   lhsrhsmklkrns          r   check_mm_compatible_shapesr+   (   s    		Q)3779>(   #	{*<SWWYKq	J IIbcNEArIIbcNEB	
b(   "t#KB4q	Rr   c           	          t        |j                  |k(  xrD |j                  t        j                  t        j                  t        j
                  ft        | z   v |  d| d|j                   d       y )Nz\(): all inputs are expected to be of the same dtype and one of (half, bfloat16, float32) or z, but got dtype == r!   )r   dtyper   halfbfloat16floattuple)r   r   r-   additional_dtypess       r   check_dtyper3   9   sp    		5 	SGGZZ5?P8QQS( 33D2E FGG9A	'	r   c           	      x    t        |      dk(  sJ d fd}t         ||      |  d|d    d|d    d       y )	Nr   c                     | | dz
  z   S Nr    )vs    r   is_power_of_twoz(check_blocksize.<locals>.is_power_of_twoG   s    QK  r   c                 @    d}| D ]  }|dk\  xr  |      xr |} |S )NT   r7   )bres	blocksizer9   s      r   is_compatible_blocksizez0check_blocksize.<locals>.is_compatible_blocksizeJ   s8     	KI?Ay'AJsC	K 
r   z(): sparse inputs' blocksize (r   z, r   z;) should be at least 16 and a power of 2 in each dimension.)lenr   )r   r>   r?   r9   s      @r   check_blocksizerA   D   sX    y>Q! 
	*(01b1 OD 	Dr   c                 ^    t        | j                               dk7  r| j                         S | S )a  Return input as a triton-contiguous tensor.

    A triton-contiguous tensor is defined as a tensor that has strides
    with minimal value equal to 1.

    While triton kernels support triton-non-contiguous tensors (all
    strides being greater than 1 or having 0 strides) arguments, a
    considerable slow-down occurs because tensor data is copied
    element-wise rather than chunk-wise.
    r   )minstride
contiguous)r   s    r   make_triton_contiguousrF   X   s)     188:! ||~r   c                 r    	 t        j                  d |D         S # t        $ r t        d|  d       Y y w xY w)Nc              3   :   K   | ]  }|j                   d d   y wNr"   r$   .0r   s     r   	<genexpr>z'broadcast_batch_dims.<locals>.<genexpr>m   s     'F'F   Fz3(): inputs' batch dimensions are not broadcastable!)r   broadcast_shapes	Exceptionr   )r   tensorss     r   broadcast_batch_dimsrR   k   sB    U%%'Fg'FGG UexRSTUs    66c              '   l   K   |D ]+  }t        d       g|j                         z  }||| <   ||    - y wr   )slicer#   )r#   slice_rangerQ   r   slicess        r   slicerrW   r   s=      +(!sis   24c              '      K   |D ]B  }t        d       g|j                         z  }t        | |      D ]  \  }}|	|||<    ||    D y wr   )rT   r#   zip)dimsrV   rQ   r   sdd_slices          r   multidim_slicerr^   y   s^      4[MAEEG#dF+ 	JAw}!	 d
s
   7AAc               '   V   K   | D ]  }| |j                         E d {      y 7 wr   )rD   )rQ   r   s     r   ptr_stride_extractorr`      s,      88:s   )')c           
   #      K   dt               cxk  rdk  sJ  J dt              cxk  rdk  sJ  J dd l} fd}fd} |j                   |        D ]p  }t         |      D 	cg c]  \  }}}	t	        ||z
  |	       }
}}}	t        ||
      D cg c]  \  }}t        |||z          }}}|
d d d   g ||       r y c c}	}}w c c}}w w)Nr      c               3   T   K   t              D ]  \  } }t        d| |        y w)Nr   )rY   range)fgmg	full_gridgrid_blockss     r   generate_grid_pointsz.grid_partitioner.<locals>.generate_grid_points   s0     )[1 	#FB2r""	#s   %(c              3   n   K   j                         D ]  \  }}t        t        || |              y wr   )itemsnextr^   )rV   r   t_dimstensor_dims_maps      r   generate_sliced_tensorsz1grid_partitioner.<locals>.generate_sliced_tensors   s7     (..0 	;IAvvvq9::	;s   25)r@   	itertoolsproductrY   rC   rT   )rg   rh   rn   rq   ri   ro   
grid_pointre   gprf   gridgrV   s   ```          r   grid_partitionerrw      s     I#!#####K %A%%%%%#; (i'')=)?@ ;
/29j+/V
 
!+RCR
 
 25Z1FGA%BF#GG 4R4j:26:::;
 Hs   A*C/C
	CC5'Cc                     dd d d   }||}n!d t        fdt        ||      D              }t        |||      D ]  ^}} | |g|   y )N)i  ry   rp   c                 6    | |S t        dt        | |            S r6   )maxrC   )rv   rf   s     r   valid_grid_dimz%launch_kernel.<locals>.valid_grid_dim   s!    y	 1c!Rj))r   c              3   6   K   | ]  \  }} ||        y wr   r7   )rL   rv   rf   r|   s      r   rM   z launch_kernel.<locals>.<genexpr>   s!      
&+aN1b!
s   )r1   rY   rw   )kernelrn   rg   rh   cuda_max_gridru   sliced_tensorsr|   s          @r   launch_kernelr      sr    .tt4M#	*  
/2;/N
 
 "2;" &~ 	t%n%&r   c           
      8   | j                         j                  d      }| j                         j                  d      }t        | j	                         j                  d            }|D cg c]  }t        |j                  d             }}t        j                  |j                  d d gd |D         }d } |||d      } |||d      } ||||j                  dd        }|D cg c]  } ||||j                  dd         }}|||g|S c c}w c c}w )Nr   c              3   :   K   | ]  }|j                   d d   y wrI   rJ   rK   s     r   rM   z!prepare_inputs.<locals>.<genexpr>   s     ;aQWWSb\;rN   c                 b    | j                  ||z         j                  dt        |      dz
        S )Nr   r   )broadcast_toflattenr@   )r   
batch_dimsinvariant_dimss      r   batch_broadcast_and_squashz2prepare_inputs.<locals>.batch_broadcast_and_squash   s1    ~~j>9:BBs:"
 	
r   rp   r"   )crow_indices	unsqueezecol_indicesrF   valuesr   rO   r$   )	bsrdense_tensorsr   r   r   r   rQ   batch_dims_broadcastedr   s	            r   prepare_inputsr      s6   ##%//2L//#--a0K#CJJL$:$:1$=>F?LM!%akk!n5MGM #33Sb;7;

 .,eL -[:PRWXK'&RS(9F
  	#1&<aggbclKG 
 f6w667 N,s   +!D*Dc                    t        | |g| }|j                         j                  |dz         }|j                         j                  |dz         }|j	                         j                  ||j	                         j
                  dd  z         }||j
                  dd  z   }t        j                  |||||j                        S )Nr   r   r"   sizer   )	rR   r   r   r   r   r$   r   sparse_compressed_tensorr   )r   r   rQ   batch_shaper   r   r   r   s           r   broadcast_batch_dims_bsrr      s    &vs=W=K##%22;3FGL//#00u1DEKZZ\&&{SZZ\5G5G5L'LMF23'D))k6SZZ r   c                     | j                   ^ }}}|||d   z  |d   ||d   z  |d   gz   }| j                  |      j                  dd      S )Nr   r   r   r"   )r$   view	transpose)r   r>   restr'   r*   	new_shapes         r   tile_to_blocksizer      sd    ''KT1a	Yq\!	Yq\!	 I 66)&&r2..r   c                    | j                   dk  r!| j                  d      } | j                   dk  r!| j                   dkD  r| j                  d| j                   dz
        } | j                   dk(  sJ | j                         | S )zReturn tensor as 3D tensor by either prepending new dimensions to
    the tensor shape (when ``tensor.ndim < 3``), or by collapsing
    starting dimensions into the first dimension (when ``tensor.ndim >
    3``).
    rb   r   )ndimr   r   r$   )tensors    r   	as1Dbatchr      so     ++/!!!$ ++/{{Q6;;?3;;!)V\\)Mr   accumulatorsc                   |d   }| j                   dk(  sJ | j                  \  }}}|dk(  r|dd \  }}	|j                   dk(  sJ |j                  \  }
}}||k(  sJ |B|j                  d   dz
  }t        j                  |||f| j                  | j
                        }n|j                  \  }}}||k(  sJ ||k(  sJ |dz  s|dz  s|dz  st        ^t        |j                  d   dz
        D ]>  }||   }||dz      }t        ||      D ]   }|	|   \  }}||xx   | |   ||   z  z  cc<   " @ |S t        | |||	|       |S |dk(  r|j                  }t        |      }|j                  \  }}}||z  dk(  sJ |dd \  }}}}}|d	   }|^||j                         j                         dz   |z  z   } t        j                  g |dd
 | || j                  | j
                        }n|j                  d
d \  } }!|!|k(  sJ |j                  }"t        |      }||z  }|dz  s|dz  s|dz  st        |j                          t        |      D ]  }#t        |j                  d         D ]  }||   j                         }$||   j                         }||dz      j                         }t        |$|      \  }%}&||#|%|%|z   |&|&|z   f   }'t        ||      D ]D  }||   ||   }}t        |j                         |      \  }(})|'| |   ||#|(|(|z   |)|)|z   f   z  z  }'F   nt        | |||||||       |j                  |"      S |dk(  r1|j                  }t        |      }|j                  \  }}}||z  dk(  sJ |dd \  }}}}|d	   }|^||j                         j                         dz   |z  z   } t        j                  g |dd
 | || j                  | j
                        }n|j                  d
d \  } }!|!|k(  sJ |j                  }"t        |      }||z  }|dz  s|dz  s|dz  st        t        |      D ]  }#t        t        |            D ]  }*t        ||*   j                         |      \  }%}&|%|z  }+|&|z  },||+   j                         }-||+dz      j                         }.||#|%|%|z   |&|&|z   f   }'t!        t        |-|.            D ]Q  \  }/}||,|.z  ||,z
  |-z  z   |/z      j                         }t        ||      \  }(})|'| |   ||#|(|(|z   |)|)|z   f   z  z  }'S   n>t        j"                  d|j                  |j
                        }t        | |||||||       |j                  |"      S t%        |      )ad  Scattered matrix multiplication of tensors.

    A scattered matrix multiplication is defined as a series of matrix
    multiplications applied to input tensors according to the input
    and output mappings specified by indices data.

    The following indices data formats are supported for defining a
    scattered matrix multiplication operation (:attr:`indices_data[0]`
    holds the name of the indices data format as specified below):

    - ``"scatter_mm"`` - matrix multiplications scattered in batches
      of tensors.

      If :attr:`blocks` is a :math:`(* 	imes M 	imes K) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor,
      and :attr:`indices = indices_data['indices']` is a :math:`(*
      	imes 3)` tensor, then the operation is equivalent to the
      following code::

        c_offsets, pq = indices_data[1:]
        for r in range(len(c_offsets) - 1):
            for g in range(c_offsets[r], c_offsets[r + 1]):
                p, q = pq[g]
                accumulators[r] += blocks[p] @ others[q]

    - ``"bsr_strided_mm"`` - matrix multiplications scattered in
      batches of tensors and a tensor.

      If :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor,
      :attr:`others` is a :math:`(* 	imes K 	imes N)` tensor,
      :attr:`accumulators` is a :math:`(* 	imes M 	imes N)` tensor, then
      the operation is equivalent to the following code::

        c_indices, r_offsets, p_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for i, r in enumerate(r_offsets):
                r0, r1 = divmod(r, N)
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for g in range(c_indices[i], c_indices[i+1]):
                    p = p_offsets[g]
                    q0, q1 = divmod(q_offsets[g], N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

    - ``"bsr_strided_mm_compressed"`` - matrix multiplications
      scattered in batches of tensors and a tensor. A memory and
      processor efficient version of ``"bsr_strided_mm"`` format.  If
      :attr:`blocks` is a :math:`(Ms 	imes Ks) tensor, :attr:`others`
      is a :math:`(* 	imes K 	imes N)` tensor, :attr:`accumulators`
      is a :math:`(* 	imes M 	imes N)` tensor, then the operation is
      equivalent to the following code::

        c_indices, r_offsets, q_offsets, meta = indices_data[1:]
        for b in range(nbatches):
            for r in r_offsets:
                m = (r // N) // Ms
                n = (r % N) // Ns
                r0, r1 = divmod(r, N)
                c0, c1 = c_indices[m], c_indices[m + 1]
                acc = accumulators[b, r0:r0 + Ms, r1:r1 + Ns]
                for i, p in enumerate(range(c0, c1)):
                    q = q_offsets[n * c1 + (SPLIT_N - n) * c0 + i]
                    q0, q1 = divmod(q, N)
                    acc += blocks[p] @ others[b, q0:q0 + Ks, q1:q1 + Ns]

      where ``Ns = N // meta['SPLIT_N']``, and ``M`` and ``K`` are
      integer multiples of ``Ms`` and ``Ks``, respectively.

      Notice that the order of ``r_offsets`` items can be arbitrary;
      this property enables defining swizzle operators via
      rearrangements of ``r_offsets`` items..

    Auxilary functions are provided for pre-computing
    :attr:`indices_data`. For example,
    :func:`bsr_scatter_mm_indices_data` is used to define indices data
    for matrix multiplication of BSR and strided tensors.

    Parameters
    ----------
    blocks (Tensor): a 3-D tensor of first matrices to be multiplied

    others (Tensor): a tensor of second matrices to be multiplied. If
      ``indices_data[0]=="scatter_mm"``, the tensor is a 1-D batch
      tensor of second input matrices to be multiplied. Otherwise, the
      second input matrices are slices of the :attr:`others` tensor.
    indices_data (tuple): a format data that defines the inputs and
      outputs of scattered matrix multiplications.

    Keyword arguments
    -----------------

    accumulators (Tensor, optional): a tensor of matrix product
      accumulators. If ``indices_data[0]=="scatter_mm"``, the tensor
      is a 1-D batch tensor of output matrices. Otherwise, output
      matrices are slices of the :attr:`accumulators` tensor.
    r   rb   
scatter_mmr   Nr-   r   r;   bsr_strided_mmSPLIT_Nr"   bsr_strided_mm_compressed)r   )r   r$   r   zerosr-   r   _scatter_mm2rd   r   r{   item_scatter_mm6zero_divmodr   r@   	enumerateemptyNotImplementedError)0blocksothersindices_datar   indices_formatPMsKs	c_offsetspqQKs_NsRMs_Ns_rg0g1rv   pqothers_shapeBKN	c_indices	r_offsets	p_offsets	q_offsetsmetar   MN_accumulators_shaper<   r_r0r1accq0q1jr'   r*   c0c1is0                                                   r   r   r     s   H "!_N;;!IAr2%$QR(	2{{a\\
3Syy"Q&A ;;B6<<L ',,KAsC"99"997b2gbL,@9??1-12 =q\q1u%r2 =Aa5DAq Ovay6!9'<<O==  BE	+	+||6",,1a2v{{;G;K8	9iDy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EAr7N7)// .'\7b2gbL,@ 1X 
Qyq12 	QA"1**,B"1**,B"1q5)..0B#B]FB&q"rBw,R"W'DEC"2r] Q(|Yq\1!'!!4Bvay6!R"r'\2R<2O+PPPQ	Q
Q 	   !344	6	6||6",,1a2v{{0<QR0@-	9iy/immo**,q0Q66A ;;*,s#*Q**&,,v}}L !&&rs+EAr7N7)// .'\7b2gbL,@1X Qs9~. 
QA#IaL$5$5$7;FBbAbA"1**,B"1q5)..0B&q"rBw,R"W'DEC )%B- 8 Q1%a"f!r/A&AA&EFKKM!'1Bvay6!R"r'\2R<2O+PPPQ
QQ IOOI4D4DI 	   !344 ".11r   c           
      \   ||||	|
|hd hk(  r.t         j                  j                         }t        d| ||||f|dt         j                  df      }| |j
                  d$i | |S | ||fdk(  rU||fdk(  rd}d}d}d	}d}
d	}	n||fd
k(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d}d}
d	}	nx| ||fdk(  rU||fdk(  rd}d}d}d}d}
d}	n[||fd
k(  rd}d}d}d	}d}
d}	nF||fdk(  rd	}d}d}d	}d}
d	}	n1||fdk(  r)d}d}d}d	}d}
d	}	n| ||fdk(  rd||fdk(  rd	}d}d}d}d}
d}	n||fd
k(  rd}d}d}d}d}
d}	n||fdk(  rd}d}d}d	}d}
d}	n||fdk(  rd}d}d}d	}d}
d	}	n||fdk(  rd}d}d}d}d}
d	}	n| ||fdk(  rd||fdk(  rd	}d}d}d}d}
d}	n||fd
k(  rd	}d}d}d	}d}
d}	n||fdk(  rd	}d}d}d	}d}
d	}	nk||fdk(  rd}d}d}d	}d}
d	}	nW||fdk(  rPd	}d}d}d}d}
d	}	nC| ||fdk(  r;||fdk(  rd}d}d}d}d}
d}	n'||fd
k(  rd}d}d}d}d}
d}	n||fdk(  rd}d}d}d}d}
d	}	|(ddd	ddddddd	j                  |d      }|dk\  r|dk\  rd}||z  }|t        |dk  rdnd|      }|t        |dk  rdnd|      }|
xs d}
|	t        | |      dkD  rddddj                  |d	      }	n`t        | |      dk(  rddddj                  |d	      }	n:t        | |      dk(  rdd	dj                  |d	      }	ndddj                  |d	      }	|xs d	}||k  sJ t        ||             ||k  sJ t        ||             || k  sJ t        | |              ||k  sJ t        ||!             ||k  sJ t        ||"             t        d$||||
|	|d#|S )%Nr   r         ?version)   r   r   )r;   r;   r   r;      )    r   r   r   )@   r   )   r   )   r   r      r   r   )   r   r   )r   r   )   r   r   )   r   r   r   )	r;   r   r   r   r   r   r   r   i    r   r   r   )r;   r   r   )r;   r   )TILE_Mr   )TILE_Nr   )r   r   )r   r   )r   r   )r   r   
GROUP_SIZE
num_stages	num_warpsr   r7   )	r   r   get_device_namer	   float16updategetrC   dict)r   r   r   r   r   r   r   r   r   r   r   extradevice_namer   r   s                  r   scatter_mm_metar     s    	J
CvMjj0021b"s+	
 DKK % K q!9
"Bx8#

	bX%

	bX%

	bZ'

	AY*$Bx8#

	bX%

	bX%

	bZ'

	AY+%Bx8#

	bX%

	bX%

	bZ'

	bZ'

	AY+%Bx8#

	bX%

	bX%

	bZ'

	bZ'

	AY+%Bx8#

	bX%

	bX%

	 

 #a* 	 9dG	
gB~28RR0~28RR0qJq!9tA1-11"a8IAY$A1-11"a8IAY#A**2q1IA**2q1IqJR<3V33<R<3V33<7$D1$$77$D1$$77$D1$$7   r   c                    |t         j                  }|d}||	|
|hd hk(  rt         j                  j                         }| |||||dk(  |dk(  |dk(  f}t	        d|||||f      }||dk7  rt	        d||||df      }|it	        dg |d d d|dd  |||df      }t        |xs i       D ]8  }||   }|d   }|d	   }||z  }||z  dk(  s ||k  s&t        |      }||z  |d	<   : | |j                  di | |S t        d
| d|d|d|d|d|d|       |xs t        ||z  d      }|xs d}|
xs d}
|	xs d}	t        d|||
|	d|S )Nr   r   r   bsr_dense_addmmr   r   *rb   r   z@bsr_dense_addmm uses non-optimal triton kernel parameters for M=z K=z N=z Ms=z, Ks=z beta=z alpha=r   )r   GROUP_SIZE_ROWr   r   r7   )
r   r   r   r   r	   sortedr   r   r   r{   )r   r   r   r   r   betaalphar   r   r   r   sparsityr-   _versionr   r   keyr   matching_metamkeymeta_r*   split_ncs                           r   bsr_dense_addmm_metar	    s   ( }J7D6Ajj002!QB	419eqjAsK(E89T
 <HO!3hs=SD <$!)#bq')3)QR)!5#.	M }23 -%d+G	*Lq5A:!q&;D&'1fDO- DKK % K
 SQRPTTXVWUYY][\Z^^c`b_ddjgifkkrmqlss{uzt|} (Q"WaG#(qNqJQI %	
  r   c                   2    e Zd ZdZd Zd Zd Zed        Zy)TensorAsKeyaS  A light-weight wrapper of a tensor that enables storing tensors as
    keys with efficient memory reference based comparision as an
    approximation to data equality based keys.

    Motivation: the hash value of a torch tensor is tensor instance
    based that does not use data equality and makes the usage of
    tensors as keys less useful. For instance, the result of
    ``len({a.crow_indices(), a.crow_indices()})`` is `2`, although,
    the tensor results from `crow_indices` method call are equal, in
    fact, these share the same data storage.
    On the other hand, for efficient caching of tensors we want to
    avoid calling torch.equal that compares tensors item-wise.

    TensorAsKey offers a compromise in that it guarantees key equality
    of tensors that references data in the same storage in the same
    manner and without accessing underlying data. However, this
    approach does not always guarantee correctness. For instance, for
    a complex tensor ``x``, we have ``TensorAsKey(x) ==
    TensorAsKey(x.conj())`` while ``torch.equal(x, x.conj())`` would
    return False.
    c                 h   d }t        j                  |      | _        |j                  t        j
                  u r ||      | _        n|j                  t        j                  t        j                  hv r2 ||j                                ||j                               f| _        ns|j                  t        j                  t        j                  hv r2 ||j                                ||j                               f| _        nt        |j                        t!        | j                        | _        y )Nc                    | j                   j                  s| j                   j                  rJ | j                          | j                         | j	                         | j
                  | j                         | j                   fS r   )r-   is_floating_point
is_complexdata_ptrstorage_offsetr$   rD   )objs    r   get_tensor_keyz,TensorAsKey.__init__.<locals>.get_tensor_keyD  sb     		33syy7K7KWciiWL""$		

		 r   )weakrefref_obj_refr   r   stridedr  
sparse_csrr   r   r   
sparse_csc
sparse_bscccol_indicesrow_indicesr   hash_hash)selfr  r  s      r   __init__zTensorAsKey.__init__C  s    	&  C(::&%c*DHZZE,,e.>.>??s//12s01DH ZZE,,e.>.>??s//12s01DH
 &cjj11$((^
r   c                     | j                   S r   )r  r  s    r   __hash__zTensorAsKey.__hash__h  s    zzr   c                     t        |t              sy| j                  |j                  | |u S | j                  |j                  k(  S )NF)
isinstancer  r  r  )r  others     r   __eq__zTensorAsKey.__eq__k  sA    %-88uyy0 5= xx599$$r   c                 "    | j                         S )z'Return object if alive, otherwise None.)r  r"  s    r   r  zTensorAsKey.objt  s     }}r   N)	__name__
__module____qualname____doc__r   r#  r'  propertyr  r7   r   r   r  r  ,  s+    ,#$J%  r   r  )maxsizec	           	      B   |j                   }	|	J |	j                         |	j                         }}
|
j                  }t        j
                  }| dk(  r*||z  }g }t	        j                  |||      |z  }t        ||z        D ]o  }|
|   j                         }|
|dz      j                         }||k(  r2|j                  ||| ||z  z  j                  |      |j                  ||z
        z          q t	        j                  |      }|
j                         }|j                         }|||z  z  }||z   j                  d      }|
}||   j                  |      }|j!                  dd      \  }}||   }| |||fS | dk(  r||z  }g }g }t	        j                  |||      |z  }t        ||z        D ]  }|
|   j                         }|
|dz      j                         }||k(  r2|j                  t	        j                  ||||      j                  |             |j                  ||| ||z  z  j                  |      |j                  ||z
        z           t	        j                  |      }|
j                         }|j                         }|||z  z  }||z   j                  d      }t	        j                  |
d d t	        j"                  ||   j                  |      d      f      }t	        j                  |      }| ||||fS | d	k(  r|}dg}g }t        |      D ]  }t        ||z        D ]  }|
|   j                         }|
|dz      j                         }t        ||z        D ]l  }|j                  |d   |z   |z
         t        ||z
        D ]?  } || z   }!||!   j                         |||z  z  z   ||z  z  |z   }"|j                  |!|"g       A n   | t	        j$                  |||      t	        j$                  |||      fS t'        d
| d      )Nr   r   r   rp   T)
descendingstabler   r   r   zInvalid indices_format=z>. Expected bsr_strided_mm_compressed|bsr_strided_mm|scatter_mm)r  r   r   r   r   int32arangerd   r   appendrepeatrepeat_interleavecatdiffnonzeror   sortcumsumr   r   )#r   r   r   r   r   r   nbatchesr   compressed_sparse_tensor_as_keyr   r   r   r   indices_dtyper   q_offsets_lstr<   r'   r   r   r   crow_indices_diffnon_zero_row_indicesar   r   nnz_per_rowindicesp_offsets_lstr   
pq_offsetsr*   r   r   r   s#                                      r   _bsr_scatter_mm_indices_datarG  z  s    *
-
-C?? # 0 0 2COO4E+L  FKKM44'\LLfEJqBw 	Aa%%'Ba!e$))+BRx  R#rAv.66w?%%b2g./	 IIm,	(--/088: BF+ULL$	 	'(<=OOPWX*//4/MWg&		9i@@	+	+'\LLfEJqBw 	Aa%%'Ba!e$))+BRx  R=HOOPWX   R#rAv.66w?%%b2g./	 IIm,	(--/088: BF+ULL$	IIRa %&:;MMgV
	 IIm,		9iKK	<	'C	
x 		2A17^ 2!!_))+!!a%(--/qBw 2A$$Yr]R%7"%<="27^ 2F(^002Q!r']BqBwORSS"))1a&1222		2 LL-GLL=H
 	
 &~''ef
 	
r   c                    | j                         dk(  sJ | j                  dk(  sJ | j                         }| j                         }| j	                         j
                  dd }| j
                  \  }}|\  }	}
|j
                  dd \  }}||k(  sJ |j
                  dd j                         }t        ||||	|
fi |}d|vr<|j                  | j                  t        j                  t        j                  hv        |d   }t        |||||	|
||t        |       	      }|dk(  r|j                  d	
       ||fz   S |dk(  r|j                  d
       ||fz   S |S )zkComputes indices data for :func:`scatter_mm` used in BSR and
    strided tensor matrix multiplication.
    r   r   r"   N
allow_tf32rI  r   r   T)is_compressedr   F)	dense_dimr   r   r   r   r$   numelr   r   r-   r   r   r/   rG  r  )r   r&  r   
meta_inputr   r   r>   r   r   r   r   K_r   r<  r   r   r   s                    r   bsr_scatter_mm_indices_datarP    sf    ==?a88q==##%L//#K

""23'I99DAqFBKKEB7N7{{3B%%'H1aB9j9D:%syyU]]ENN,KKL9oG/1aR7K<LL 44$'tg%%	+	+%(tg%%r   c           
      0   | j                   dk(  sJ |j                   dk\  sJ | j                  d   | j                  d   |j                  d   }}}| j                         j                  dd }|t        | |d      }|d   }|@t	        j
                  g |j                  dd ||| j                  | j                        }|j                  }	t        |      }| j                         dk(  r|j                          n|d	v r/|j                          t        | j                         |||
       nT|dk(  rC|j                  dd j                         }
t	        j                  |
|z  |d   z  |z  |d   z  |d   |d   f| j                  | j                        }t        |      j                  dd      j                  |
||d   z  |d   ||d   z  |d         j!                  dd      j#                  dd      }t        | j                         |||
       |j%                  |j'                  d|
||d   z  ||d   z  f      j!                  dd      j)                  |
||      j                  dd             nt+        |      |j                  |	      S )zBSR @ strided -> stridedr   r"   rp   Nr   )r   r   r   >   r   r   r   r   r   )rb   r   r   r   )r   r   rb   r   )r   r$   r   rP  r   r   r-   r   r   _nnzr   r   rM  r   r   r   movedimr   copy_	unflattenreshaper   )r   r&  r   outr   r   r   r>   r   	out_shaper<  r   r   s                r   bsr_scatter_mmrY    s    88q==::??2		"u{{2BB

""23'I2'B
 "!_N
{kk'ekk#2''B'syy
 		I
C.C
xxzQ			J	J		3::<3G	<	';;s#))+{{21-2ilB!!
 ))::
 eYr2Til"!il"! Wl WQ] 	 	3::<LQ		""HbIaL0"	!2DE Wl WXr2&Yr2		
 ".1188Ir   Fr   r   rW  skip_checksmax_gridr   inputr   denserW  r[  r\  r   c                p   ||j                   t        j                  u rd}	|j                         }
|
j	                         dz
  }|j
                  |   }|j
                  d   }t        |	||      }t        j                  |||fz   t        j                  |j                        }t        | ||||||||	      S )N_int_bsr_dense_addmmr   rp   r   rZ  )r-   r   int8r   r#   r$   rR   r   r2  r   r   )r]  r   r^  r   r   rW  r[  r\  r   r   r   
batch_ndimr   r   original_batch_dims_broadcasteds                  r   r`  r`  E  s     {u{{ejj0''')!%%'!+
IIj!KKO*>vsE*R'kk+q!f4++<<

 
 
r   c                   ! d}	|j                         }
|j                         }|j                         }|j                         dz
  }|j                  ||dz    \  }}|
j                  |dz   |dz    }|j                  d   }|#t        |	||      }|j                  |||fz         }|j                         dk(  sdk(  s|dk(  s
|dk(  s|dk(  r@dk(  r|j                          |S |j                  |        dk7  r|j                         |S St        d|j                         |d   z  |d   z  ||z  z  z
  d      }t        ||||d   |d   ||j                  	      |}t        || ||      \  }}}
} }}|\  j                  d|z        }||z   |}t!        | f      }t!        | f      }t!        |  f      } t"        j$                  t&        j(                  t"        j*                  t&        j(                  t"        j(                  t&        j,                  t"        j,                  t&        j,                  t"        j.                  t&        j0                  t"        j0                  t&        j0                  i|j                     !|j3                  d      }|j3                  d      dz
  }|j3                  d	      }|||f}|*t5        |d d d d d         d
dt7        |d d       z
  z  z   }nd }|
d|d|d| d|d|di}dk7  sJ  !fd}t9        ||||       |j;                         |j;                         k7  r*|j                  |j=                  |j                               |S )Nr   r   r   rb   rp   r   )r  r-   r   r   r   r   NNr   Nrp   )r   r   )r   r   Nc                     t        |    g t        | dk(  dk7  dk(  t        j                  k(  d y )Nr   r   )beta_is_onebeta_is_nonzeroalpha_is_oneBLOCKSIZE_ROWBLOCKSIZE_INNERBLOCKSIZE_COLrI  	acc_dtype)_bsr_strided_addmm_kernelr`   tlfloat32)	ru   r   BKBMBNr   r   dot_out_dtyper   s	     r   r~   zbsr_dense_addmm.<locals>.kernel  sm    !$' 	
!>2	
	
 	
 	 AI!$

2#	
 	
r   )r   r   r   r#   r$   rR   	new_emptyrR  r   rT  mul_roundr	  r-   r   r   r   r   r   rq  rr  r/   float64ra  r2  r   r1   r@   r   r  r   )"r]  r   r^  r   r   rW  r[  r\  r   r   r   r   r   rb  r   r   r>   r   rc  r  
out_backupr   out_untiled	n_batchesn_block_rowsn_block_colsrg   rh   rn   r~   rs  rt  ru  rv  s"      ``   `                     @@@@r   r   r   j  sh    FZZ\F##%L//#K!!#a'J99Z*q.1DAqZ!^j1n=IBA {*>vsE*R'oo=AFG
xxzQ%1*Q!q&AF19IIK
 
 IIeqy
|SXXZ)A,61EQOOQRS#aLaL))

 J;IUE3<8L+vueS FBhhy!r'*G	
gBK
C"b
*Ceb"X.Eeb"X.E 	rzz

rzzrzz

BHHRXX 
iiM 

1I$$R(1,L::b>LL,7IHRaL2./'QXbq\AR=R2SS 	m_{}[O A::
 
  &/9kB
||~,,.. 	))**:*:;<r   IS_BETA_ZEROrl  rn  TILE_Kro  rI  c            
         t        j                  d      } t        j                  d      }!||| z  z   ||!z  z   }"t        j                  |"      }#t        j                  |"|z         }$|$|#z
  }%|%dk(  ry t        j                  d|      }&t        j                  d|      }'||| z  z   |	|#z  z   |
|&d d d f   z  z   ||'d d d f   z  z   }(||| z  z   ||#z  z   })||| z  z   ||!z  z   ||&d d d f   z  z   }*||| z  z   ||'d d d f   z  z   }+t        j                  d|      },t	        |%      D ]0  }-t        j
                  ||f|      }.t        j                  |)      }/t	        d||      D ]  }0|0|,z   }1|1|k  }2t        j                  |*||1d d d f   z  z   |2d d d f   d      }3t        j                  |+||/z  z   ||1d d d f   z  z   |2d d d f   d      }4|.t        j                  |3|4||      z  }. |r|.| z  }.n| |.z  |t        j                  |(      z  z   }.t        j                  |(|.j                  |j                  j                               |(|	z  }(|)|z  })3 y )Nr   axisr   r-           maskr&  rI  	out_dtype)rq  
program_idloadr3  rd   r   dotstoretor-   
element_ty)5r   r   r  rl  rn  kr  
values_ptrvalues_batch_stridevalues_nnz_stridevalues_row_block_stridevalues_col_block_stridecrow_indices_ptrcrow_indices_batch_stridecrow_indices_stridecol_indices_ptrcol_indices_batch_stridecol_indices_stridemat1_ptrmat1_batch_stridemat1_tiled_row_stridemat1_tiled_col_stridemat1_row_block_stridemat1_col_block_stridemat2_ptrmat2_batch_stridemat2_tiled_row_stridemat2_tiled_col_stridemat2_row_block_stridemat2_col_block_stridero  rI  	batch_pidrow_block_pidcrow_indices_offset_ptr
nnz_offsetnnz_offset_nextrow_nnzrow_block_arangecol_block_arangevalues_block_ptrscol_index_nnz_ptrmat1_block_ptrsmat2_block_ptrsk_tile_arange_	acc_block	col_blockk_tile	k_offsetsmask_k
mat1_block
mat2_blocks5                                                        r   _sampled_addmm_kernelr    s
   F MMq)	1- ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	 &23 :-. 	 )+,#m34 $&6q$w&??@ 	 )+,#&6tQw&??@ 	 		!V,w &	4A-!?yQI  12I1f- "]2	"QWW#&;ia>P&PPa
  WW#+i78+i4.@@A  4
 RVV
zY 	%, U"	!I-rww?P7Q0QQ	 HH&	Z5E5E5P5P(QR !22!33M&	4r   r   c                    t        j                  d      }t        j                  d      }t        j                  d      }t        j                  d      }t        j                  d      } t        j                  |||| |      \  }}|||z  z   ||z  z   }!t        j                  |!      }"t        j                  |!|z         }#|#|"z
  }$|$dk(  ry t        j
                  d|      }%t        j
                  d|      }&| ||z  z   ||"z  z   ||%d d d f   z  z   ||&d d d f   z  z   }'|||z  z   ||z  z   ||&d d d f   z  z   ||%d d d f   z  z   }(|||z  z   ||z  z   ||z  z   ||%d d d f   z  z   ||%d d d f   z  z   })||	|z  z   |
|"z  z   }*t        j                  ||f|      }+t        |$      D ]m  },t        j                  |'      }-t        j                  |*      }.t        j                  |(||.z  z         }/|+t        j                  |-|/||      z  }+|'|z  }'|*|
z  }*o t        j                  |)|+j                  |j                  j                               y Nr   r  r   r   r  r  )rq  r  num_programs	swizzle2dr  r3  r   rd   r  r  r  r-   r  )0r  r  r  r  r  r  r  r  r  r  r  	dense_ptrdense_batch_stridedense_tiled_row_stridedense_tiled_col_stridedense_row_block_stridedense_col_block_stride
output_ptroutput_batch_strideoutput_tiled_row_strideoutput_tiled_col_strideoutput_row_block_strideoutput_col_block_striderl  rn  ro  rI  r   r  r  col_block_pidr~  r  r  r  r  r  r  r  r  dense_block_ptrsoutput_ptrsr  output_acc_blockr  values_blockdense_row_idxdense_blocks0                                                   r   "_bsr_strided_dense_rowspace_kernelr  d  s   \ MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.a<99Q699Q6 !I-.*,- &(8D(AAB &(8q(AA	B 	  9,-$}45 %'74'@@A %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 88]M$B)Tw 	4A77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#	4( 	.11*2B2B2M2MNOr   c           
          |j                  d      }|j                  d      dz
  }||f}|*t        |d d d d d         ddt        |d d       z
  z  z   }nd }|d|d|d|	d|
di}|j                  t        j
                  t        j                  fv rt        j                  d	nt        j                  d
 fd}t        ||||       y )Nr   rp   r   r   r   )r   N)r   rp   )r   rg  TFc                 L    t        |    g	t        | ddd y )Nr   r   )ro  rI  r   r   )r  r`   )
ru   r   ro  rI  r   r   r>   is_beta_zeror  tile_ks
     r   r~   z)_run_sampled_addmm_kernel.<locals>.kernel  sY    !$' 	
   &~6 $%r   )r   r1   r@   r-   r   r.   r/   rq  rr  rz  r   )r   r   r  r>   r  r  r   r   r   mat1mat2r\  r}  r~  rg   rh   rn   r~   ro  rI  s   ``````            @@r   _run_sampled_addmm_kernelr    s     KKN	#((,q0-	!TrT 23gSRTSTEVAV6WWKKI'')
 <<EJJ77

IJ

IJ	 	 	foy+Fr   g      ?)r   r   rW  r[  r\  r  r  c                   d}t        ||        t        || ||      }	|st        ||| j                         t        ||| j                         |dk7  r.| j                  t
        j                  u rt        d| d| d       | j                  t
        j                  ur/t        ||| j                         t        ||| j                         nt        |||j                         t        |||       |t        ||       t        |||j                         t        ||| j                         t        |j                  |	j                  k(  xr! |j                         | j                         k(  | d|	j                   d|	j                          d|j                   d	|j                          	       ||	j                  |j                  d
      }n|j                  |	       |j                         dk(  s|j                         dk(  r|S |j                         j                  dd  }
|j!                  d      }|j!                  d      }|j!                  d      }|dk(  s|dk(  r!|j                         j#                  |       |S |}t%        |||      \  }}}}}t'        ||
d   |f      }t'        |||
d   f      }t)        |
 }t+        |||dk(  |
||||||||       |j                         j-                         dd  |j-                         dd  k7  rF|j                         j                  |j/                  |j                         j                               |S )Nsampled_addmmr  Fz(): having beta == z3 not equal to 0.0 with boolean mask is not allowed.z!(): Expects `out` to be of shape z and with nnz equal to z but got out.shape = z and out.nnz = T)copyr   r"   rp   r   r   )r   r   r   r   r-   r   boolr   r3   r+   r$   rR  r  rT  rM  r   r   rx  r   r   r{   r  rD   rV  )r]  r  r  r   r   rW  r[  r\  r   input_broadcastedr>   r'   r*   r  r{  r   r   r   r  s                      r   r  r    s    !'4VUD$Ou||4u||4s{u{{ejj8h1$7jk {{%**,FD%++6FD%++6FD$**5&vtT: -VS$++6FC5II!2!8!88WSXXZ5::<=Wh?@Q@W@W?X Y-->-C-C-E,F G++.99+_SXXZLR ;#&&tzz&=CII'(99;!sxxzQJJJL&&rs+	IIbMIIbMIIbM C<16JJLd#J 
8FsDRV8W5k64 	!a'89 9Q<'89i!CK	
$ %%',0DD%%fnnZ5F5F5H5N5N&OPr   )rW  r[  r\  r   c                @   d}| j                   dd  \  }}|st        ||        t        || |j                         t	        || |j
                  t        j                  f       t        || |       |j                  d      }	| j                         j                   dd  \  }
}t        ||
|f       t        |	dz   | d|	 d       n|j                   dd  \  }}	t        || |      }|o|sm|||	fz   }t        |j                   |k(  d| d|j                    d	       t        |j                         xs  |j                  dd      j                         d
       ||j!                  |||	fz         }| j#                         dk(  r|j%                         S t'        || |dd|      S )Nbsr_dense_mmr"   rp   r;   z(): dense.size(-1) == z should be divisible by 16z9bsr_dense_mm(): `out` argument has wrong shape, expected z
, but got r!   zbsr_dense_mm(): only row-major/col-major `out` arguments are supported, i.e. (out.is_contiguous() or out.transpose(-2, -1).is_contiguous()) should be True.r   r   )r   r   rW  )r$   r   r   r   r3   r-   r   ra  r+   r   r   rA   r   rR   is_contiguousr   rw  rR  r   r   )r   r^  rW  r[  r\  r   r   r'   r(   r*   	row_blockr  r)   rc  expected_out_shapes                  r   r  r  v  s     		"#2VS)ell3U[[5::-@&vsE:

2A#&::<#5#5bc#: IyFY	$:;F
(03MN
 KK$EB*>vsE*R'?;!@Aq6!I		//./z#))AG
 !!#Ls}}R'<'J'J'L" ;//"AQF"JKC 88:?99; sCaaSIIr   MAX_ROW_NNZTILEc                    t        j                  d      }t        j                  d      }t        j                  d      }| ||z  z   ||z  z   }t        j                  |      }t        j                  ||z         }||z
  }|dk(  ry t        j                  d|
      }|||z  k  }|||z  z   ||z  z   ||z  z   }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  |d      }t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  |d      }t        j                  ||kD  ||      } t        j                  ||z
        }t        j                  |d      }t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  ||z
        }|t        j                  |d      z  } t        j                  ||z   ||z  j                  |j                  j                        |       t        |
|	|
      D ]  }||
z  }|||z  k  }t        j                  ||z   |t	        d             j                  t         j                        }t        j                  ||z
        }t        j                  ||z   ||z  j                  |j                  j                        |        y )Nr   r  r   r   infr  )r  )rq  r  r  r3  r0   r  rr  r{   rd   whereexpsumr  r-   r  )r  r  r  r  r  r  values_nnz_col_block_strider  r  r  r  r  row_block_offset_pidr  r  r  r  r  
row_aranger  curr_row_values_ptrsrow_tilemax_row_valuer  curr_max_row_valuenumdenoms                              r   _bsr_softmax_kernelr    sJ    MMq)	!}}!41- ')34!M12 	 
 WW45
''"9<O"OP "J.a<YYq$'
Gi// !I-.%(<<= 9$% 	 77 :-Du

"RZZ. 	 xa0t[$/ 		A$J) 33Dww$z1U5\Mbn  "$q!9HH 22MCUM		 ffX-.s#t[$/ 	)A$J) 33Dww$z1U5\Mbn  &&M12CRVVCa((E	) 	 :-5[Z--889	

 t[$/ 	A$J) 33Dww$z1U5\Mbn  &&M12CHH$z1u  !1!1!<!<=	r   c                 *   d}t        ||        t        || | j                         | j                         dk(  s| j	                         dk(  r| j                         S | j                  dd  \  }}| j                         }| j                         j                  dd  \  t        j                  |      nt        j                        | j                         j                  d      j                  dd      }| j                         j                  dd      j                         r| j                         j                         }n| j                         }|j                  dd      j                         j                  d      j                  dd      j!                  d|z        }|j                  d   |z  f}d }	|dd df   d|d	i}
fd
}t#        ||
||	        |j!                  d|      j                  dd      j                   | j                         j                   }t%        j&                  | j                         j                         | j)                         j                         || j                  | j*                        S )Nbsr_softmaxr   r"   r   rg  rp   .rf  re  c                 R    t        |    g t        | t        d        y )Ni   )r  r`   rC   )ru   r   r  max_row_nnzr  s     r   r~   zbsr_softmax.<locals>.kernel3  sJ    % %~6  	 E;'r   r   )r   r3   r-   rR  rM  cloner$   r   tritonnext_power_of_2r   r   r   r   r  rE   rV  r   r   r   r   r   )r]  r  r   r'   r*   nnzr   r   rg   rh   rn   r~   r  r  s    `          @@r   r  r    s6   'FE5;;/::<1 2;;= {{231jjl$||~33BC8	9 003K 00=K))+55a8@@BG <<>##B+99;\\^))+F\\^FR$Z\Yq\WQ^WRC)O4 	 \\!_ii@	 crc"MO	
	 	foy+FFNN2y#y9Yr2Welln**, 	 -- &&(%%'<<
 	
r   queryr  value	attn_mask	dropout_p	is_causalscalec           	          d}t        | | d       t        |d u| d       |J t        |j                  t        j                  k(  | dt        j                   d|j                   d       t	        ||| j
                         t	        ||| j
                         t	        ||| j
                         t        ||| j                         t        ||| j                         |j                  t        j                  urt        ||| j                         t        || |j                  dd      d	d
      }|| j                  d      dk(  s|d	k(  rt        d
| d| d       |'dt        j                  | j                  d            z  n|}	|j                         j                  |	       t!        |      }t        j"                  j$                  j'                  |j                         |d       t)        ||      }|S )N_scaled_dot_product_attentionz'(): is_causal == True is not supported.z'(): attn_mask == None is not supported.z(): attn_mask.layout must be z, but got attn_mask.layout == r!   r"   rp   r  F)r   r[  r   z(): current value of scale == z results in division by zero.r   T)r   inplace)r   r   r   r   r   r   r3   r-   r  r  r   r   mathsqrtr   rx  r  nn
functionaldropoutr  )
r  r  r  r  r  r  r  r   sdpascale_factors
             r   r  r  M  s    1)mx'NOPit#x/V%WX$$$ 0 00h ((-(8(8'9 :##,#3#3"4A7	
 	VS%,,/VUELL1VY5FC-FE5;;/??%**,	5;;7ucmmB3#5
 =UZZ^q0ESL(8 @/ /
 9>q499UZZ^445<(4 ##DKKMY#MD%(r   r   r   r   rv  r   r   c                 .   | |z  }||z  }t        j                  d      }t        j                  d      }||z  }||z  }||z  t        j                  d|      z   }||z  t        j                  d|      z   }t        j                  d|      } ||d d d f   |z  | d d d f   |z  z   z   }!|| d d d f   |	z  |d d d f   |
z  z   z   }"t        j                  |||z  z         }#t        j                  ||dz   |z  z         }$|#|$k(  ry t        j                  ||f|      }%t        |#|$      D ]  }&t        j                  ||&|z  z         }'t        j                  ||&|z  z   |z         }(t        j                  |!|'|z  z         })t        j                  |"|(|z  z         }*|%t        j                  |)|*||      z  }% |||z  z   |d d d f   |z  |d d d f   |z  z   z   }+t        j                  |+|%j                  |j                  j                               y Nr   r  r   r  )r  rI  )rq  r  r3  r  r   rd   r  r  r  r-   r  ),r   r   r   
blocks_ptrblocks_stride_Pblocks_stride_Mblocks_stride_K
others_ptrothers_stride_Qothers_stride_Kothers_stride_Naccumulators_ptraccumulators_stride_Raccumulators_stride_Maccumulators_stride_Npq_offsets_ptrpq_offsets_stridepq_ptrpq_stride_Tpq_stride_1rv  r   r   rI  r   r   pid_tpidpid_mpid_nrmrnrkA_ptrB_ptrr   r   r  r   r   r   Ar   C_ptrs,                                               r   _scatter_mm2_kernelr/  {  s5   6 &[&[1%mm#r	bV^bii622V^bii622YYq!_q$wK/)BtQwK/,II
 q$wK/)BtQwK/,II
 WW^e.?&??@WW^uqy4E&EEF8HHff-]C	r2 	VA[01A[0;>?AO 334AO 334A1*UUI	V ++, 1d733T1W+ 556 	 		%5%;%;%F%FGHr   r   r   rF  
pq_indicesr   c                    | j                   \  }}|j                   \  }}|j                   \  }	}}t        t        ddz        t        ddz        dd      }
fd}t        j                  t
        j                  t        j                  t
        j                  t        j                  t
        j                  t        j                  t
        j                  i|j                     }d|
vr#|
j                  |t
        j                  k(         t        |   || | j                  d	      | j                  d      | j                  d      ||j                  d	      |j                  d      |j                  d      ||j                  d	      |j                  d      |j                  d      j                  d	      ||j                  d	      |j                  d      fd
|i|
 y )Nr;   r   r   r   )r   r   r   r   c                     j                   d   dz
  t        j                  | d         t        j                  | d         z  dfS )Nr   r   r   r   r$   r  cdiv)METAr   r   rF  s    r   ru   z_scatter_mm2.<locals>.grid  sI      #a'AtH~.QX1OO r   rI  rJ  r   rv  )r$   r   r{   r   r   rq  rr  r/   rz  r-   r   r/  rD   )r   r   rF  r0  r   r   r   r   r  r   r   ru   rv  r   r   s     `          @@r   r   r     s    ,,1a,,1a$$1ar16?3r16?qTU
	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?D!MM!MM!MM!MM!MM!MM!"""a a a )	
* (+	
, -	
r   r   rK  r   r   c                    ||z  }||z  }||z  }t        j                  d      }t        j                  d      }|| z  } || z  }!||z  }"||"z  }#|#|z  }$t        ||$z
  |      }%|$||%z  z   }&||"z  |%z  }'|&|z  t        j                  d|      z   }(|'|z  t        j                  d|      z   })t        j                  d|      }*||(d d d f   |z  |*d d d f   |z  z   z   }+|| |	z  z   |*d d d f   |
z  |)d d d f   |z  z   z   },t        j                  ||!z         }-|rW|-|z  |z  }.|-|z  |z  }/t        j                  ||.z         }0t        j                  ||.z   dz         }1|/|1z  ||/z
  |0z  z   }2|1|0z
  }3n8t        j                  ||!z         }2t        j                  ||!z   dz         }4|4|2z
  }3||2z   }5t        j
                  ||f|      }6|r|+0|z  z  }+t        |3      D ]j  }7t        j                  |5      }8t        j                  |,|8z         }9t        j                  |+      }:|6t        j                  |:|9||      z  }6|+|z  }+|5dz  }5l n||2z   };t        |3      D ]  }7t        j                  |5      }8t        j                  |,|8z         }9t        j                  |;      }<t        j                  |+|<|z  z         }:|;dz  };|5dz  }5|6t        j                  |:|9||      z  }6 ||-z   | |z  z   |(d d d f   |z  |)d d d f   |z  z   z   }=t        j                  |=|6j                  |j                  j                               y r  )rq  r  rC   r3  r  r   rd   r  r  r  r-   r  )>r<  r   r   r   r  r  r  r  r  others_stride_Br  r  r  accumulators_stride_Br  r  c_indices_ptrr_offsets_ptrp_offsets_ptrq_offsets_ptrrK  rv  r   r   r   r   rI  r   BLOCKS_MBLOCKS_Npid_t_r%  pid_br$  num_pid_in_groupgroup_idfirst_pid_mgroup_size_mr&  r'  r(  r)  r*  r+  r,  r   r'   r*   r   r   r   r   r   q_ptrr  r  r   r   r-  p_ptrr   r.  s>                                                                 r   _scatter_mm6_kernelrG    s   < '\<<A&mm#!("%0**+8k1:>s\12''L8V^bii622V^bii622YYq"q$wK/)BtQwK/,II
 o%&!T'{_,r$'{_/LLN 	 GGME)*aBAQ2A*+B*Q./BR7Q;",,Br'C./B.23Br'C"HHff-]C	R/))E3Z GGENGGEAI&GGENRVVqMj 	 (
 "B&E3Z 	GGENGGEAI&GGENGGEA$778

RVVqMj 		 ++, 1d733T1W+ 556	 	 		%5%;%;%F%FGHr   r   r   r   r   force_contiguousc	                    |d   }	| j                   \  }
}|j                   \  }}|j                   \  }}}||k(  sJ ||	z  |k(  sJ fd}t        j                  t        j                  t        j
                  t        j                  t        j                  t        j                  t        j                  t        j                  i|j                     }d|vr#|j                  |t        j                  k(         |j                  d      dk(  sJ j                  d      dk(  sJ |j                  d      dk(  sJ |j                  d      dk(  sJ |rD| j                         } |j                         }|j                         s|j                         }n|}n|}t        |   ||| | j                  d      | j                  d      | j                  d      ||j                  d      |j                  d      |j                  d      ||j                  d      |j                  d      |j                  d      |||fd|i| |r#|j                         s|j                  |       y y y )	Nr   c                     j                   d   z  t        j                  | d         t        j                  | d         z  fS )Nr   r   r   r3  )r5  r   r   r   r   s    r   ru   z_scatter_mm6.<locals>.grid  sD    "Q&BX/&++b$x.2QQ r   rI  rJ  r   r   r   rv  )r$   r   r   rq  rr  r/   rz  r-   r   rD   rE   r  rG  rT  )r   r   r   r   r   r   r   r   rH  r   r   r   rO  r   B_r   r   ru   rv  accumulators_r   r   r   s      `                @@@r   r   r   l  sS    y/LL	2r<<2q &&	ArQww'\Qww	 MM2::NNBJJMM2::MM2::	

 

 t#KK=BJJ#>K?"a'''"a'''"a'''"a''' &&(F&&(F--/ , 7 7 9 ,(MD!MM!MM!MM!MM!MM!MM!  #  #  #)	
* (+	
, -	
2 L$>$>$@}- %Ar   ri  rj  rk  rm  c)                    t        j                  d      })t        j                  d      }*t        j                  d      }+t        j                  d      },t        j                  d      }-t        j                  |*|+|,|-|'      \  }*}+|||)z  z   ||*z  z   }.t        j                  |.      }/t        j                  |.|z         }0|0|/z
  }1t        j
                  d|"      }2t        j
                  d|$      }3t        j
                  d|#      }4| r.|||)z  z   ||*z  z   ||+z  z   ||2d d d f   z  z   ||4d d d f   z  z   }5| ||)z  z   ||/z  z   ||2d d d f   z  z   ||3d d d f   z  z   }6|||)z  z   ||+z  z   ||3d d d f   z  z   ||4d d d f   z  z   }7|||)z  z   ||*z  z   ||+z  z   ||2d d d f   z  z   ||4d d d f   z  z   }8||	|)z  z   |
|/z  z   }9| r3t        j                  5      j                  |%      }:|r|!s$||z  };|:|;z  }:nt        j                  |"|#f|%      }:t        |1      D ]m  }<t        j                  |6      }=t        j                  |9      }>t        j                  |7||>z  z         }?|:t        j                  |=|?|&|%      z  }:|6|z  }6|9|
z  }9o |!s|:|z  }:t        j                  |8|:j                  |j                  j                               y r  )rq  r  r  r  r  r3  r  r   rd   r  r  r-   r  )@r  r  r  r  r  r  r  r  r  r  r  	input_ptrinput_batch_strideinput_tiled_row_strideinput_tiled_col_strideinput_row_block_strideinput_col_block_strider  r  r  r  r  r  r  r  r  r  r  r  r   r   ri  rj  rk  rl  rn  rm  ro  rI  r   r   r  r  r  r~  r  r  r  r  r  r  inner_block_aranger  
input_ptrsr  r  r  r  r  
beta_alphar  r  r  r  s@                                                                   r   rp  rp    s   p MMq)	1-1-A.A.')||=,n(
$}
 ')34!M12 	 
 WW45
''"9<O"OP "J.99Q6YYq/:99Q6 $y01(=89 )=89 )+;AtG+DD	E
 )+;D!G+DDE  !I-.*,- &(8D(AAB &(:47(CC	D 	  9,-$}45 %'9!T''BBC %'7a'@@	A 	 !I-.%56 &56 &(8D(AA	B
 &(8q(AAB 	 &23 :-. 	 !wwz255i@L!E\
 J. !xx(FiXw 	4A77#45L GG$56M'' #9M#IIK
 kjI! 
 !22!33#	4& % 	.11*2B2B2M2MNOr   r   )NNNNNN)NNNNNNr   )r   )NN)r  FN)T)Cr
  osr  	functoolsr   typingr   r   r   torch._dynamo.utilsr   torch.utils._tritonr   _triton_ops_metar	   intgetenvr
   r   r   r   r+   r3   rA   rF   rR   rW   r^   r`   rw   r   r   r   r   r   r   r   r	  r  rG  rP  rY  Tensorr  r   r`  r   r  triton.languagelanguagerq  jit	constexprr  r  r  r  r  r  r  r0   r  r/  r   rG  r   rp  r7   r   r   <module>rd     s    	   "  ) * & .1BII:A>. *

"(&U;2&0 7F	
/ >B m2l kl 
GTK K\ =>]
 ?]
B  ; FDX 

"&MQ"<<"	" <<" 
%,,	" " uXc]HSM8C=HIJ" 4."T 

"&MQu<<u	u <<u 
%,,	u u uXc]HSM8C=HIJu 4.up < ZZ{4 ll{4 ||	{4
 ||{4 {4> <<?{4@ LLA{4 {4z ZZAPN ||OAPP ||QAPR <<SAPT LLUAPV WAP APF3Gt &*!QUW||WllW llW ell#W W 5#x}!LMNWz '+!QU#5J\\5J||5J ell#	5J
 5J 5#x}!LMN5J tn5Jn ZZV \\V llV VpE
X !%,||,\\, ||, ELL)	,
 , , ,\ ZZEI<<EI<<EI <<EI* ||+EI, -EI. /EI0 LL1EI EIN5
5
5
 LL5
 LL	5

 ll5
n ZZoI LLoI* ||+oI, ||-oI. /oI0 1oI2 3oI4 LL5oI6 LL7oI oIt "&X.X.X. <<X. <<	X.
 <<X. <<X. X. llX. X.t ZZ_PX \\Y_PZ [_P\ ll]_P^ ||__P` ||a_Pb c_Pd <<e_Pf LLg_Ph i_Pj k_P _PD KLM$(!LL $r   