
    sg6                     *    d dl Z d Zd Zd Zd Zd Zy)    Nc                 `   t        j                  d| |      dddf   j                  d|      }t        j                  d||      j                  | d      }|j                  dk(  rdnd}|j                  dk(  rdnd}||z  |z  |d	z  |z  z   ||z  d	z  z   }|dz  dk(  |dz  dk(  z  j	                  t         j
                        }|dz  dk(  |dz  dk(  z  j	                  t         j
                        }	|||	z
  z  }|||	z
  z  }d}
||
z  }||
z  }|| z  |
z  ||
z  z   |z   j                  d
      S )aS  
    This is PyTorch implementation of main part of reorder_meta()
    function, from tools/util/include/cutlass/util/host_reorder.h file
    of CUTLASS source tree.  Furthermore, CUTLASS template for sparse
    GEMM decides upon layout of this matrix, and at the moment for the
    sparse GEMM executed on tensor cores, this is layout described by
    ColumnMajorInterleaved<2> data structure, in
    include/cutlass/layout/matrix.h of CUTLASS source tree.  The
    reordering of meta matrix into meta_reordered matrix calculated
    according to these segments of CUTLASS code is re-implemented here.
    Note that this calculation produces offsets for scattering metadata
    matrix elements into reordered metadata matrix elements (or,
    equivalently, for gathering reordered metadata matrix element back
    into metadata matrix elements).
    r   deviceN                   )torcharangerepeatitemsizetoint8view)m
meta_ncols
meta_dtyper   dst_rowsdst_colsgroup
interweavetopright
bottomleft
interleavecols_majcols_mins                \/var/www/html/venv/lib/python3.12/site-packages/torch/sparse/_semi_structured_conversions.py*_calculate_meta_reordering_scatter_offsetsr!      sd     ||Aq0D9@@JOH||Az&9@@AFH %%*BE ))Q.AJEE!a<:
%	&e
!	"  A"x!|q'89==ejjIHa<1$A):;??

KJ:%%H:%%H
 J:%H*$HqL:%:(==HNNrRR    c                    | j                         dk7  rt        d| j                          d      | j                  \  }}| j                  }t        j
                  }| j                  t        j
                  k(  rt        j                  }ne| j                  t        j                  t        j                  t        j                  fv rt        j                  }nt        d| j                   d      |j                  dz  dz  }|dvrt        d	      |t        j                  k(  r|d
z  dk7  r&t        d| d      |dz  dk7  rt        d| d      |d|z  z  dk7  rt        d| dd|z         | j                  t        j                  k7  r2d}| j                  d||z  |      }|dk7  j                  d      \  }}	}
}n4d}| j                  d||z  |      }|dk7  j                  d      x\  }}
\  }	}|||z  z  }||	z  }| |	z  }| |	 z  }|}|}||z  |z  }||	 z  }||j                  t        j                         dz  z  }||j                  t        j                         dz  z  }| j                  t        j                  k7  roj#                  d|j%                  d            }|j#                  d|j%                  d            }t	        j&                  ||fd      j                  ||dz        }n7j#                  d|j%                  d      dz        j                  ||dz        }||dz  z  }|j                  d||f      j                  |      }|dk(  r=|dddddf   |dddddf   dz  z  |dddddf   dz  z  |dddddf   dz  z  }n|dk(  r||dddddf   |dddddf   dz  z  |dddddf   dz  z  |dddddf   dz  z  |dddddf   d
z  z  |dddddf   dz  z  |dddddf   dz  z  |dddddf   dz  z  }j)                  ||z  f      }t+        ||||      }|j-                  d||j                  d             ||j                  ||      fS )z
    This function converts dense matrix into sparse semi-structured
    representation, producing "compressed" matrix, in the layout used by
    CUTLASS backend, and corresponding metadata matrix.
    r   z)Expected 2-dimensional dense tensor, got -dimensional tensorInvalid datatype z of dense matrixr   r
   )r
   r   z6Invalid number of elements per meta element calculatedr	   r   zNumber of rows of dense matrix z must be divisible by 16r   z must be divisible by 32z"Number of columns of dense matrix z must be divisible by r   r   )dimN                        )r&   RuntimeErrorshaper   r   r   dtypeint32halfbfloat16floatint16r   r   unbindr   int64gather	unsqueezestack	new_emptyr!   scatter_)denser   kr   r   quadbits_per_meta_elemksparsedense_4m0m1m2m3dense_2r   expr0expr1expr2bit0bit1bit2bit3idxs0idxs1sparse0sparse1sparsemeta_4meta_nmetameta_reorderedmeta_offsetss                                  r    )sparse_semi_structured_from_dense_cutlassrY   /   s    yy{a7		}DWX
 	
 ;;DAq\\FJ{{ejj [[
	U^^U[[A	A[[
.u{{m;KLMM'00149V+STTU[[ r6Q;1!4LM  r6Q;1!4LM  	A&&'1,03I!NdJdIef
 	
 {{ekk!**Rgw7!Q,..r2BB**Rgw7"a<//33BRw!778JH GEC"HEC2#IEDD5=2DB3;DDGGEKK(A-.EDGGEKK(A-.E{{ekk!..U__R%89..U__R%89gw/R8==aaHEOOB$71$<=BB1a1fMeqj!F[["j*@ABEEjQF"1a7OaAg!#%aAg!#% aAg"$& 	 
 1	$1a7OaAg!#%aAg!#% aAg"$& aAg"$	&
 aAg"$& aAg"$& aAg"$& 	 ^^Q^$56N=	:z6L A|TYYr];N'':677r"   c                 |	   | j                         dk7  rt        d| j                          d      | j                  \  }}| j                  }|j                         dk7  rt        d|j                          d      |j                  |k7  rt        d| d|j                   d      |j                  }|t
        j                  t
        j                  fvrt        d| d	      |j                  d
z  dz  }| j                  t
        j                  k7  rd}nd}|j                  \  }}	||k7  rt        d| d|       |	|z  |z  d|z  k7  rt        d| d|	|z  |z  dz   d      t        ||	||      }
t        j                  |j                  d      d|
      j                  ||	      }t        j                  ||	d|z  f||      }|dk(  r|dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |d
z	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   n#|d
k(  r|dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |d
z	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |dddddf<   |dz	  dz  |ddddd
f<   |dz	  dz  |dddddf<   |d z	  dz  |dddddf<   |d!z	  dz  |ddddd"f<   |d#z	  dz  |dddddf<   |d$z	  dz  |ddddd%f<   |d&z	  dz  |dddddf<   |d'z	  dz  |ddddd(f<   |j                  d      t        j                  dd|z  |z  |z  |)      dz  j                  dd      j                  dd      j                  d      z   }t        j                   |dz  |z  f| j                  |      }| j                  t
        j                  k7  r#|j#                  d|| j                  d             n\|j                  t
        j$                        j#                  d|| j                  t
        j$                        j                  d             |j                  |d|z        S )*z
    This function performs reverse of the function above - it
    reconstructs dense matrix from a pair of "compressed" matrix, given
    in the layout used by CUTLASS backend, and accompanying metadata
    matrix.
    r   z*Expected 2-dimensional sparse tensor, got r$   z(Expected 2-dimensional meta tensor, got zExpected meta matrix to be on z device, got matrix on z devicer%   z of meta matrixr   r
   zNumber of rows of meta matrix z4 must be equal to number of columns of spase matrix z#Number of columns of sparse matrix z different from the z<, expected according to the number of columns of meta matrixr   r   r1   r   r'   Nr   r+   
   r)   r(      r-   r	      	   r*         r,         r.         r   )r&   r/   r0   r   r1   r   r6   r2   r   r5   r!   r9   r   emptyr   r   zerosr=   r3   )rS   rW   r   r?   r   r   r@   rA   
meta_nrowsr   rX   rV   meta_2dense_offsetsr>   s                  r    'sparse_semi_structured_to_dense_cutlassrk      sJ    zz|q8FYZ
 	
 <<DAq]]Fq 6~7I7I7K6LL_`
 	
 &,VH4KNLaLaKbbij
 	
  %%J%++u{{33.zl/JKK'00149||u{{"+11J
Q,ZL8lmnlop
 	
 G44A=1!4HV]I]`vIvz{I{H| }I I
 	
 >	:z6L <<++B/LAFFq*UD [[	
J223F
 "+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw	1	$+q!Qw19,q!Qw19,q!Qw19,q!Qw19,q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw2:-q!Qw BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!Rx BJ$.q!RxKKOQA	W,V<q@
d2qk&&A,ttBx(M KKQFLLHE||u{{"q-R9

5::''}fkk%**5::2>	
 ::aQr"   c                 |    d }| j                  ddd      j                  ddd      D ]  }|D ]
  } ||         | S )a  
    This function computes a 2:4 sparse tile by greedily taking the largest values.

    Since we take the largest values greedily, how the sorting algorithm handles duplicates affects
    the ultimate sparsity pattern.

    Note that this function does not have the same sorting semantics as our CUDA backend,
    which is exposed via `torch._sparse_semi_structured_tile` and thus returns a different pattern.
    c                     g d}g d}| j                         j                  dd      j                  D ]>  }|dz  |dz  }}||   dk  r#||   dk  r||xx   dz  cc<   ||xx   dz  cc<   8d| ||f<   @ y )N)r   r   r   r   T)
descendingstabler
   r   r   r   )flattensortindices)tilenum_kept_rownum_kept_colxrcs         r    greedy_prune_tilez7_sparse_semi_structured_tile.<locals>.greedy_prune_tile(  s    ##$$T$BJJ 	A61q5qAA"|A':Q1$Q1$QT
	r"   r   r
   r   )unfold)r>   ry   batchrs   s       r    _sparse_semi_structured_tiler|     sR    
 aA&--aA6 $ 	$Dd#	$$ Lr"   c                 
   | j                         j                  t        j                        }|j	                  ddd      j	                  ddd      }|j	                  ddd      j	                  ddd      } |j
                  g |j                  dd ddd }dt        j                  dt        j                  d	      z  }|j                  t        j                        |z  j                  t        j                        }|S )
zH
    Calculates the compressed swizzled bitmask from a dense tensor
    r   r   r   r   r
   r'   Ncudar[   )	boolr   r   uint8rz   reshaper0   r   r5   )r>   int_bitmaskbitmask_8x8_chunksbitmask_4x4_chunksbitmask_binary_representationpowers_of_twocompressed_swizzled_bitmasks          r    $_compute_compressed_swizzled_bitmaskr   ;  s     **,//%++.K %++Aq!4;;Aq!D ,221a;BB1aK %?$6$>$> %		!	!"1	%%'(%*+%-.%!
 au{{6JJM 	&((5Ebo   '&r"   )r   r!   rY   rk   r|   r    r"   r    <module>r      s'    'STB8Jf R<)'r"   