
    sgG                         d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 ddl
mZ d ZddZ	 	 ddZdd	Z G d
 d      Z G d d      Zd ZddZddZd Zedd       ZddZy)    N)contextmanager)AnyDictList   )languagec                    dj                  |       } dddd| z   dg}t        j                  |      }|j                  t        j
                  j                        j                  d      }|D cg c]  }t        |       }}|S c c}w )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounits)	join
subprocesscheck_outputdecodesysstdoutencodingsplitint)attrscmdoutretxs        A/var/www/html/venv/lib/python3.12/site-packages/triton/testing.pynvsmir   
   sy    HHUOEsNU$:<[
\C

!
!#
&C
**SZZ((
)
/
/
4C
a3q6
C
J  s   -Bc                    ddl }|dv sJ |j                  j                         |j                  j                         k(  rt	        d       |         |/|D ]*  }|j                          |j                  d       d|_        , |j                  j                         }|j                  j                  |      5   |         ddd       |j                  j                          |j                  j                  d      }|j                  j                  d      }|j                          |j                          |j                          |j                  j                          |j                  |      }	t        dt!        ||	z              }
|j                  j                         }|j                  j                  |      5  t#        |
      D ]  }||D ]	  }d|_          |          	 ddd       |j                  j                          g }d}t#        |      D ]  }|j                  j                  d      }|j                  j                  d      }|j                          |j                          |j                          |j                  j                          ||j                  |      |
z  gz  } |j%                  |      } t'        ||      |      j)                         S # 1 sw Y   .xY w# 1 sw Y   xY w)	a+  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    r   NminmaxmeanmedianzQCannot capture graph in default stream. Please use side stream in benchmark code.Tenable_timingr   
   )torchcudacurrent_streamdefault_streamRuntimeErrordetach_requires_grad_grad	CUDAGraphgraphsynchronizeEventrecordreplayelapsed_timer!   r   rangetensorgetattritem)fnrepgrad_to_nonereturn_moder'   r   gstart_event	end_eventestimate_msn_repeatir   	n_retriestimess                  r   do_bench_cudagraphrF      s    ::::zz  "ejj&?&?&AAnooD  	AIIKT"AF	 	

A			!	 
	JJ**"""6K

  t 4IHHJ	JJ**95K1c#+,-H 	

A			!	 x 	A'% "A!AF"D		 
JJ
CI9 @jj&&T&:JJ$$4$8		


 ((3h>??@ LLE&75+&u-2244C  s   8K,(K9,K69Lc           	         |dv sJ ddl }|j                  j                  j                  |      }	 |         |	j	                          |r(|j                  t        d      |j                  |      }
n'|j                  t        d      |j                  |      }
|	j                  d      }|	j                  d      }|j                          t        d	      D ]  }|
j                           |          |j                          |	j	                          |j                  |      d	z  }t        d
t        ||z              }t        d
t        ||z              }t        |      D cg c]  }|	j                  d       }}t        |      D cg c]  }|	j                  d       }}t        |      D ]	  } |          t        |      D ]O  }||D ]	  }d|_         |
j                          ||   j                           |         ||   j                          Q |	j	                          |j                  t!        ||      D cg c]  \  }}|j                  |       c}}|j"                        }|P|j%                  ||j                  ||j"                              j'                         }t)        |      d
k(  r|d   }|S  t+        ||      |      j-                         S c c}w c c}w c c}}w )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float]
    :param fast_flush: Use faster kernel to flush L2 between measurements
    :type fast_flush: bool
    r   r   Ng    A)dtypedeviceg    ATr$      r   )rH   )r'   _dynamodevice_interfaceget_interface_for_devicer1   emptyr   int8r2   r3   r6   zero_r5   r!   r.   r7   zipfloatquantiletolistlenr8   r9   )r:   warmupr;   r<   	quantiles
fast_flushr=   device_typer'   dicacher?   r@   _rA   n_warmuprB   rC   r   serE   r   s                          r   do_benchr`   R   s   & ::::		'	'	@	@	MBDNN
 C
O599[QCJejjM (((.Kt,I1X 
 NN**959K 1c&;./0H1c#+,-H9>xIA288$8/IKI7<XG!-GIG8_ 
 8_  #!  	A
! NNLLK8ST1!..+T\a\g\gLhEnnUELL%++L$NOVVXs8q=a&C
&75+&u-22447 JG( Us   $K&K+K0
c                    dd l }dd l}t        | |j                        s|j	                  |       } t        ||j                        s|j	                  |      }|d}t        |      r || j                        n|}|d}t        |      r || j                        n|}t        | |j                        rU| j                  |j                  k(  r| j                         } | j                         j                         j                         } t        ||j                        rU|j                  |j                  k(  r|j                         }|j                         j                         j                         }| j                  dkD  s|j                  dkD  r!|j                  j                  | |||d       y |j                  | |||      st        | d|  d	| d
| d| d
      y )Nr   g{Gz?g        r   T)atolrtol	equal_nan)rb   rc    z is not close to z (atol=z, rtol=))numpyr'   
isinstanceTensorr7   callablerH   bfloat16rR   cpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrb   rc   err_msgnpr'   s          r   assert_closerv      s    a&LLOa&LLO|$TN4=D|$TN4=D !U\\"77enn$	AEEGNN""$!U\\"77enn$	AEEGNN""$ 	vvzQVVaZ


""1ad"N;;q!$T;2y!,=aSvWUYTZZ[\]] 3    c                   t    e Zd ZdZ	 	 	 	 	 	 ddee   dee   dedee   dee   ded	eeef   d
edededefdZ	y)	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    Nx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     || _         || _        |
| _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        y)a  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        N)rz   r{   r   r|   r}   r~   r   stylesr   r   r   r   )selfrz   r{   r|   r}   r~   r   r   r   r   r   r   colorr   s                 r   __init__zBenchmark.__init__   sY    \ 
 "$
"	rw   ) r   FFNN)
__name__
__module____qualname____doc__r   strr   r   boolr    rw   r   ry   ry      s     :c: S	: 	:
 9: I: : 38n: : : : :rw   ry   c            	       8    e Zd Zd Z	 	 d	dedededefdZd
dZy)Markc                      || _         || _        y Nr:   
benchmarks)r   r:   r   s      r   r   zMark.__init__
  s    $rw   bench	save_path
show_plots
print_datac                 	   dd l }dd lm}	 dd l}
|j                  }|j                  D cg c]  }| d	 }}|j                  D cg c]  }| d	 }}t        |j                        }|
j                  ||z   |z   |z         }|j                  D ]  }t        |t
        t        f      s|D cg c]  }| }}t        |      t        |      k7  rt        dt        |       d|       t        t        ||            }g g g }}}|j                  D ]I  } | j                   di ||j"                  |i|j$                  |}	 |\  }}}||gz  }||gz  }||gz  }K t        |      |z   |z   |z   |j(                  t        |      <    |j*                  r|	j-                          |	j/                         }|d   }t1        |j                        D ]  \  }}||dz      ||dz      }}|j2                  r|j2                  |   d   nd }|j2                  r|j2                  |   d   nd }|j5                  ||   ||   |||       |j7                         j9                         r|j7                         j9                         r|j;                  t<              }|j;                  t<              }|j?                  ||   ||d	|
        |jA                          |jC                  |jD                  xs |       |jG                  |jH                         |jK                  |jL                  rdnd       |jO                  |jP                  rdnd       |r|	jS                          |r8|	jU                  |jV                  jY                  ||j*                   d             |||j                  z      }|r=|jZ                  d   dk(  r+|j\                  j_                         \  }}||   ||   z
  |d<   |r1ta        |j*                  dz          ta        |jc                                |r?|je                  |jV                  jY                  ||j*                   d      d| dd       |S c c}w c c}w c c}w # t&        $ r
 |d d }}}Y 8w xY w)Nr   z-minz-max)columnsz	Expected z values, got r   )labelr   lsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr~   listrz   	DataFramer{   rh   tuplerU   
ValueErrordictrQ   r}   r:   r|   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullallastyperR   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   rT   print	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meanr   y_miny_maxrz   dfr\   x_argsrow_meanrow_minrow_maxrs   r   axfirst_xrC   colstycol0col1s                                  r   _runz	Mark._run  s!   '!!%*%5%56A3d66%*%5%56A3d66u}}%\\'F"2U":U"B\C 	EAa$/ '(1Q((1vW% 9S\N-s!KLL#gq/*F)+RwgH__ #dggVV5>>1*=VVvV;+.(FE5 VH$E7"E7"# #1g07:WDBFF3r7O'	E* ??JJLBajG!%"2"23 V1!!f*~r!f*~u,1LLell1oa(d,1LLell1oa(d7RU!33G||~))+ELLN4F4F4H!LL/E!LL/EOOBwKTQTOUV IIKMM%,,1'2MM%,,'MM5;;%H=MM5;;%H=
BGGLLu6Gt4LMN%***+q(**,JD$DBtH,BvJ%//C'(",,.!IIbggll90A.FGXZ[iZjjkVl!  #	y 76 ) ! ;+.d5EF;s#   QQ#,	Q(Q--R ?R c           	         t        | j                  t              }|r| j                  gn| j                  }g }|rRt        j                  |d       t        t        j                  j                  |d      d      }	|	j                  d       |D ]I  }
|j                   | j                  |
|||fi |       |s+	j                  d|
j                   d       K |r!	j                  d       |	j                          |r	|r|d	   S |S y )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rh   r   ry   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runzMark.runS  s    %dooyA*:doo&

KK	D1Y?EDJJ'( 	HEidiiy*j[TZ[\

]5??*;:FG	H JJ)*JJL!!}$!!rw   N)F   )FFr   F)	r   r   r   r   ry   r   r   r   r   r   rw   r   r   r     s>    % chC) C C CSW CJrw   r   c                       fd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                     t        |       S r   )r   r   s    r   <lambda>zperf_report.<locals>.<lambda>r  s    b*- rw   r   )r   wrappers   ` r   perf_reportr   k  s     .GNrw   c                    ddl }ddlm} | s|j                  j	                         } |j
                  j                  j                  |       d   }|j
                  j                  j                  |       d   }||z  dz  dz  d	z  }|S )
z return DRAM bandwidth in GB/s r   Nr   drivermem_clock_ratemem_bus_widthr   g    .A   )r'   runtimer   r(   current_deviceactiveutilsget_device_properties)rI   r'   r   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr   v  sz    **,MM''==fEFVWM##99&A/RIi'!+c1A5GNrw   c                 J   dd l }ddlm} |s|j                  j	                         }|j
                  j                  j                  |      d   dz  }|j                  j                  |      }|d   dk  r| |j                  k(  sJ d}n| |j                  |j                  fv rd}nr| |j                  |j                  |j                  fv rd}nJ| |j                  t        j                   t        j"                  t        j$                  fv rd	}nt'        d
      ||z  |z  dz  }|S )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)r'   r   r   r(   r   r   r   r   get_device_capabilityfloat16float32int32rk   int16rO   tl
float8e4nvfloat8e4b15float8e5r+   	rH   
clock_raterI   r'   r   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr    s   **,==&&<<VDE[\_``L11&9J!}q%%%U]]EKK00"u}}ennekkBB"uzz2==".."++NN#455J&)99D@FMrw   c                        fd}|S )Nc                 F     t        j                          fd       }|S )Nc                  r   dd l }|j                  t        j                               j	                         }
j                         |j                         k  }|r|dk7  rt        j                  j                  j                  d         }t        j                  d   dd}d|v sJ d       |d   j                  j                  j                  }| d	j                   d
| d}t        j                  ddd|gd|      }	|	j                   dk(  sJ d       dt#        |	j$                        v sJ y  | i | y )Nr   zcuda-memcheck__file__PATH1)r  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r"  	ppid_namerun_cuda_memcheckr   r!  test_idr   r   target_kwargstest_fns             r   r   z1cuda_memcheck.<locals>.decorator.<locals>.wrapper  s!   rzz|499;I - 3 3 5 G Y/%Aww''(;(;J(GH!zz&1UXY F*n,nn* +0099<<b!1!1 2!G9A> nnox%L]agjk~~*e,ee*0C

OCCC((rw   )	functoolswraps)r2  r   r1  s   ` r   	decoratorz cuda_memcheck.<locals>.decorator  s%    		!	) 
"	)" rw   r   )r1  r5  s   ` r   cuda_memcheckr6    s    , rw   c           	   #     K   	 t        j                  g d       t        j                  dddd|  d|  g       t        j                  dddd| d| g       t        dg      d	   }t        d
g      d	   }t        || z
        dk  sJ d|  d       t        ||z
        dk  sJ d| d       d| z  }d|z  dz  }||f t        j                  g d       t        j                  g d       t        j                  g d       y # t        j                  g d       t        j                  g d       t        j                  g d       w xY ww)N)r   r   r   -pmr  r   r   r   z--lock-gpu-clocks=r
   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr&   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   r8  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr  gbpss         r   set_gpu_clockr?    sm    C EF a~>	!
 	 	#M?!M?C	!
 	 123A6678;<,./"4_8L\NZ^6__4==01B6b:N}o]a8bb6)L8&-dl EF AB AB 	 EF AB ABs   EB>D AEAEEc                    dd l }ddlm} |s|j                  j	                         }|j
                  j                  j                  |      d   dz  }|j                  j                         }|d   dk  r/| |j                  k(  rd}nW| |j                  k(  rd}nEt        d	      | |j                  k(  rd}n(| |j                  |j                  fv rd}nt        d	      ||z  |z  d
z  }|S )Nr   r   r   r   r  r       @   r  r  )r'   r   r   r(   r   r   r   r   r  r  r  r+   rk   r  s	            r   get_max_simd_tflopsrC    s    **,==&&<<VDE[\_``L113J!}qEMM!!emm#!455EMM!!u}}enn55!455J&)99D@FMrw   )   Nr"   )   d   NNTr"   r(   )NNr   r   )iF  i  )r3  r   r   r   
contextlibr   typingr   r   r   r   r   r
  r   rF   r`   rv   ry   r   r   r   r  r6  r?  rC  r   rw   r   <module>rI     s     	  
 % " " <5~ flL5^"^J? ?D` `F
:6 C C8rw   