
    sg)$                        d dl Z ddlmZmZmZmZmZ ddlmZ ddl	m
Z
mZ e j                  e j                  e j                  e j                  gZd Zd Zd	 Zd
 Z e eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd       eddddddd      g e       z   g de
edd       edd i      edej*                  dej*                  dej*                  dej*                  dej*                  d ej*                  d!ej*                  d"ej*                  dej*                  d#ej*                  fd$                     Z G d% d&e j.                  j0                        Zej4                  Zy)'    N   )Configautotunecdiv
heuristicsjit)language   )early_config_pruneestimate_matmul_timec                 @    dt        |       v rt        j                  S | S )Nfp8)strtorchfloat16)as    D/var/www/html/venv/lib/python3.12/site-packages/triton/ops/matmul.pyupcast_if_fp8r   
   s    A}}H    c                     t        |       } t        |      }| |u r| S | t        v sJ |t        v sJ t        D ]  }| |u r|c S ||u s| c S  y N)r   _ordered_datatypes)r   bds      r   get_higher_dtyper      sg    aAaAAv"""""""" 6H6H	r   c                       fdS )Nc                 *    |    j                         S r   )zero_)nargsnames    r   <lambda>zinit_to_zero.<locals>.<lambda>!   s    t**, r    )r    s   `r   init_to_zeror#       s	    ,,r   c                      g } dD ]u  }dD ]n  }dD ]g  }dD ]`  }|dk  rdnd}| j                  t        |||dd	||
             dD ].  }| j                  t        ||||d	||t        d                   0 b i p w | S )N)r               )       )r*   @   )r*   r+         r+   r   r&   r
   BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)r   r&      r)   C)r4   r5   pre_hook)appendr   r#   )configsr4   block_mblock_kblock_nr5   split_ks          r   get_configs_io_boundr?   $   s    G% l
 	lG# 
l1 	lG%,]INN7wSZghi*4	KL $1 l"w7W^kr#s.8IXdehXikll	l
l	ll Nr   r,   r-   r*   r.   r%   r6   r3   r+   r&   r'   )MNK
   )r   
perf_modeltop_k)r:   keyprune_configs_byEVEN_Kc                 *    | d   | d   | d   z  z  dk(  S )NrB   r1   r2   r   r"   )argss    r   r!   r!   U   s!    49Y$y/(IJaO r   	acc_dtypeinput_precisionfp8_fast_accumr/   r0   r1   GROUP_Mr2   AB_DTYPEc                 $   t        j                  d      }t        j                  d      }t        j                  ||      }t        j                  ||      }||z  }||z  }t        |||z  z
  |      }||z  ||z  z   }||z  |z  }||z  t        j                  d|      z   }||z  t        j                  d|      z   } t        j
                  t        j                  ||z  |      |      }!t        j
                  t        j                  | |z  |      |      }"||z  t        j                  d|      z   }#| |!d d d f   |z  |#d d d f   |z  z   z   } ||#d d d f   |z  |"d d d f   |	z  z   z   }t        j                  ||f|      }$t        dt        j                  |||z              D ]  }%|r+t        j                  |       }&t        j                  |      }'nz||%||z  z  z
  }(t        j                  d|j                  j                        })t        j                  | |#d d d f   |(k  |)      }&t        j                  ||#d d d f   |(k  |)      }'|"|&j                  |      }&|'j                  |      }'|rt        j                  |&|'|$||      }$n|$t        j                  |&|'||      z  }$| ||z  |z  z  } |||z  |z  z  }  |$j                  |j                  j                        }$||z  t        j                  d|      z   }||z  t        j                  d|      z   } ||d d d f   |
z  | d d d f   |z  z   z   }||k  d d d f   | |k  d d d f   z  }*|dk(  rt        j                  ||$|*       y t        j                  ||$|*       y )Nr   r
   )dtype)r
   r
   )maskother)	out_dtyperL   )rR   )tl
program_idr   minarangemax_contiguousmultiple_ofzerosrangeloadrQ   
element_tytodotstore
atomic_add)+ABr7   r@   rA   rB   	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnrK   rL   rM   r/   r0   r1   rN   r2   rH   rO   pidpid_zgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkacckr   r   k_remaining_0rR   s+                                              r   _kernelr}   6   si   Z --
CMM!EWWQ FWWQ FfEe|HVh00':Jw#
"23E5[j)E	299Q0	0B	299Q0	0B


BNN267;W
EC


BNN267;W
EC	299Q0	0B	SD\I%47i(??@A	R4[9$s47|i'??@A
((GW%Y
7C1bgga7!234 +
A
Aa7W#455K&(:(:;B47k 9DA1d7k 9DAXAXA&&AsiYC266!Q)_UUC	Ww**	Ww**#+$ &&##
$C	299Q0	0B	299Q0	0B	R4[9$r$'{Y'>>?AFAtGQa00D!|
Cd#
a4(r   c                   6    e Zd ZeZi Zed        Zedd       Zy)_matmulc                    | j                   }| j                  d      dkD  r$| j                  d      dkD  r| j                         } |j                  d      dkD  r$|j                  d      dkD  r|j                         }| j                  d   |j                  d   k(  sJ d       | j                  \  }|j                  \  }t	        | j
                  |j
                        }	||	}t        j                  f||      }
t        j                  t        j                  t        j                  ft        j                  t        j                  t        j                  ft        j                  t        j                  ft        j                  t        j                  fi}|	||	   d   }nQt        |t        j
                        sJ d       ||| j
                     v sJ d       |||j
                     v sJ d       d } ||      } ||	      }	 ||      }| j
                  t        j                  t        j                   fv r.|j
                  t        j                  t        j                   fv rd }	fd	}t#        |   | ||
|| j                  d      | j                  d      |j                  d      |j                  d      |
j                  d      |
j                  d      |||d
|	       |
S )Nr   r
   zincompatible dimensions)devicerQ   zacc_dtype must be a torch.dtypez+acc_dtype not compatible with the type of az+acc_dtype not compatible with the type of bc                 X    t        t        t        |       j                  d      d         S )N.)getattrrU   r   split)tys    r   
to_tl_typez!_matmul._call.<locals>.to_tl_type   s!    2s2w}}S1"566r   c                 L    t        | d         t        | d         z  | d   fS )Nr/   r0   r2   )r   )METAr@   rA   s    r   r!   z_matmul._call.<locals>.<lambda>   s.    T!T)_5QY8PPRVW`Rab r   r6   )rK   rL   rM   rN   rO   )r   stride
contiguousshaper   rQ   r   emptyr   float32bfloat16int8int32
isinstancerU   
float8e4nvfloat8e5r}   )r   r   rK   rL   rM   output_dtyper   rB   _ab_dtypecsupported_acc_dtypesr   gridr@   rA   s                 @@r   _callz_matmul._call   sh   88A;?qxx{QA88A;?qxx{QAwwqzQWWQZ'B)BB'ww1ww1 $AGGQWW5  #LKKAv\B MMEMM5==95>>EMM[`[i[iKjMMEMM,ejj5;;/ 

 ,X6q9Ii5X7XX5 4QWW ==l?ll= 4QWW ==l?ll=	7 y)	h'!,/ 77r}}bkk22qww2==RTR]R]B^7^Hbq!Q1HHQK!HHQK!HHQK!+)	* r   Nc                 8    t         j                  ||||||      S )N)rK   rL   rM   r   )r   r   )ctxr   r   rK   rL   rM   r   s          r   forwardz_matmul.forward   s&    }}QYhv*6  8 	8r   )NNTN)	__name__
__module____qualname__r}   kernel_locksstaticmethodr   r   r"   r   r   r   r      s3    FF7 7r 8 8r   r   )r    r   r   r   r   r   r	   rU   matmul_perf_modelr   r   r   r   r   r   r   r   r   r#   r?   	constexprr}   autogradFunctionr   applymatmulr"   r   r   <module>r      s    6 6  Gjj%--O  -$ 
 	332!LYZfgh332!LYZfgh32"KXYefg2#"KXYefg332!LYZfgh32"KXYefg2#"KXYefg32"KXYefg2"JWXdef3331MZ[ghi3331MZ[ghi32#!LYZfgh2##!LYZfgh3331MZ[ghi32"KXYefg2#"KXYefg32"KXYefg2"JWXdef)* 	+, 	0*1< O  ;) ||	;)
  \\;) LL;) \\;) -/LL;) DF<<;) \\;) -/LL;) CE,,;) []ZfZf;) =D;)|B8enn%% B8J 
r   