
    sg(                         d dl Z d dlZd dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZ g dZd Zd	 Zd
 Zd Ze j"                   G d d             Zd Zd Zy)    N)defaultdict)
DeviceType   )benchmarker)create_bandwidth_info_strget_num_bytes)foreachpersistent_reduction	pointwise	reduction
split_scantemplatec                 j    t         D cg c]  }d| | v s| }}t        |      dk(  r|d   S yc c}w )z
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    z@triton_heuristics.r   r   unknown)_kernel_category_choiceslen)src_codechchoicess      T/var/www/html/venv/lib/python3.12/site-packages/torch/_inductor/wrapper_benchmark.py"get_kernel_category_by_source_coder      sM     .3Frd1Kx1WG  7|qqzs   00c                 x    t         D cg c]  }|| j                  v s| }}t        |      dk(  r|d   S yc c}w )a  
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    r   r   r   )r   __dict__r   )
kernel_modr   r   s      r   get_kernel_categoryr   %   sC     5Rbj>Q>Q8QrRGR
7|qqz	 Ss   77c                     ddl m} | j                  j                         D cg c]$  \  }}|j	                  d      rt        ||      r|& }}}t        |      dk(  sJ |d   S c c}}w )Nr   )CachingAutotunertriton_r   ))torch._inductor.runtime.triton_heuristicsr   r   items
startswith
isinstancer   )modr   kv	cand_lists        r   get_triton_kernelr'   7   sn    J LL&&(Aq<<	"z!5E'F 	
I 
 y>QQ<s   )A%c                 D   ddl m} d}|j                  j                         D ]  \  }t	        d      rt	        d      s t              }t              }j                         t        |j                  j                  D cg c]  }|j                  d      r| c}      }|j                  j                  dd      t        d|id	z  dfd
	}	| dd|dd j                          d|dd  }
|rt	        d      sJ j!                        }t#        |
       |j                         D ]G  \  }}t#        d |	||j$                  |j&                  |j(                         d|j*                          I nt-        j.                  fddd      }t        |j0                        dk(  sJ d       |j0                  d   }t#         |	||j$                  |j&                  |j(                  |
 d             |dz  } |dk(  rt#        d       yyc c}w )aX  
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    r   )PyCodeCacheget_argscall
in_out_ptrkernel_num_gbNnum_in_out_argsg    eAc                     t        d |||fD              sd|dd|dd|dd}nd}| d	z  z  }t        | |||
      S )Nc              3   $   K   | ]  }|d u  
 y wN ).0xs     r   	<genexpr>z>benchmark_all_kernels.<locals>.get_info_str.<locals>.<genexpr>c   s     EQqDyEs     3z regs  z	 spills  8z shared mem g     @@)prefixsuffix)anyr   )msn_regsn_spillssharedr:   kernel_detail_strgb_per_snum_gbs          r   get_info_strz+benchmark_all_kernels.<locals>.get_info_strb   sj    E68V*DEE
'(1YvajT " %'!c*H,FHV<M     20    
   benchmark_all_configsr6   z @ c                  &    j                         S r1   )r+   )argsr   s   r   <lambda>z'benchmark_all_kernels.<locals>.<lambda>|   s    
- rE   (   T)rep
fast_flushr   z.Autotuner should have selected the best config)r:   zpNo kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True)r9   )torch._inductor.codecacher)   cacher    hasattrr'   r   r*   r   fn	arg_namesr!   inductor_metagetr   upperrJ   printr>   r?   r@   configr   benchmark_gpu	launchers)benchmark_namerJ   r)   nfound
kernel_keytriton_kernelkernel_categoryarg_namenum_in_out_ptrsrD   kernel_descbench_resultlauncherr=   rL   r   rC   s                 @@@r   benchmark_all_kernelsrg   C   sG    6F"-"3"3"9"9"; <
Jz:.gj&6Q)*5-j9""$ !. 0 0 : :&&|4 
 ,,00$G>"DJ/JSPF	 b!?2A#6#<#<#>"?qCR@QR 	 !:'>???%;;DAL+ , 2 2 4 "b(//8;L;Lhoo^__bckcrcrbst
 **-2$B M++,1@?@1$..q1HOO%%OO)]!, 	!y<z {~	
 ks   	H
c                   6    e Zd ZU eed<   eed<   eed<   eed<   y)ProfileEventcategorykeyself_device_time_mscountN)__name__
__module____qualname__str__annotations__floatr2   rE   r   ri   ri      s    M	H LrE   ri   c                    	
 fd
t        t              		
fd}|D ]  }|j                  rJ d       |j                  t        j
                  k(  r4d}|j                  j                  d      r\|j                  j                  d      rd}n>|j                  j                  d      rd	}n |j                  j                  d
      rd}nd} |||        fd	 fd} |        y )Nc                 (    | j                   dz  z  S )zV
        ev.self_device_time_total is in microsecond. Convert to millisecond.
          )self_device_time_total)evnrunss    r   get_self_device_timez6parse_profile_event_list.<locals>.get_self_device_time   s     ((4/%77rE   c                     t        || j                   |       | j                  z        }|   j                  |       y )N)rj   rk   rl   rm   )ri   rk   rm   append)rx   rj   
profile_ev
all_eventsrz   ry   s      r   	add_eventz+parse_profile_event_list.<locals>.add_event   s@    ! 4R 8((U"	

 	8##J/rE   z!Don't support the legacy profilerr   r   
triton_poitriton_pointwise
triton_redtriton_reduction
triton_pertriton_persistent_reductiontriton_unknownc           	         ddl m } |j                  d d       g }d}t        d|  d       |D ]]  }||j                  z  }|j                  z  d	z  d
d}|j	                  |j
                  d d |j                  |j                  |g       _ |j	                  d|d|z  d	z  d
dg       t         ||ddj                          dddg             |S )Nr   )tabulatec                     | j                   S r1   )rl   )rx   s    r   rM   zCparse_profile_event_list.<locals>.report_category.<locals>.<lambda>   s    2+A+A rE   T)rk   reverse        z
  == z category kernels == d   .2f%x   Totalr9   KernelzSelf z
 TIME (ms)CountPercent)headers)r   sortrY   rl   r|   rk   rm   rX   )	rj   profile_eventsr   rows
total_timerx   percentdevice_namewall_time_mss	          r   report_categoryz1parse_profile_event_list.<locals>.report_category   s   % A4P
z!678  	SB"000J//,>DSIKGKKr'='=rxxQR	S 	j"l)BS)H(MQ&OP	
 	K--/0
;	
	
 rE   c                      g d} t        j                               j                  t        |             sJ t        j                                       i }d}| D ]  }|v s ||         }|||<   ||z  } |z  dz  dd}t	        d
j                          d|        t	        dd	d
       d	 }| D ]&  }|j                  |d      z  dz  dd}|d| z  }( |d| dd	dz  }t	        |       y )N)r   r   r   r   r   r   r   r   r   z
Percent of time when z
 is busy: zTotal wall time z.3fz mszOutput for tabulate: z, r=   )setkeysissubsetlistrY   rX   rW   )category_listper_category_wall_timetotal_device_msrj   _timedevice_busy_percenttabulate_liner   r~   r]   r   r   r   s           r   reportz(parse_profile_event_list.<locals>.report   ss   
 :??$%..
 	(:??$%&	( 
 "$% 	)H:%'*X2FG38&x05(		) "1<!?#!Ec J!L%k&7&7&9%:*EXDYZ	
 	 c 2#67 0/?@% 	,H)--h<|KcQRUVVWX  r'^+M		,
 	212"\#4FbIImrE   )r   r   	is_legacydevice_typer   CPUrk   r!   )r]   
event_listr   ry   r   r   rx   rj   r   r~   rz   r   s   ` ```    @@@r   parse_profile_event_listr      s    8 T"J0   <<D!DD>>Z^^+66Y'vv  .-""<0-""<08+"h# &8& &P HrE   c                    ddl }|j                         }|j                  dddd       |j                  dd	dd
       |j                  dddd       |j                         }|j                  rt        | |j                         yd}d} |||      dz  }|j                  syt        j                  j                  d      5 } |||       ddd       t        j                          d}	j                  |	       t        d|  d       t        d|	        |j                  d      }
t        |
j                  dd             t!        | |
|||z  |j"                         y# 1 sw Y   xY w)zM
    This is the function called in __main__ block of a compiled module.
    r   Nz--benchmark-kernelsz-k
store_truez,Whether to benchmark each individual kernels)actionhelpz--benchmark-all-configsz-cz8Whether to benchmark each individual config for a kernelz	--profilez-pz&Whether to profile the compiled modulerI   )timesrepeatrv   T)record_shapesz/compiled_module_profile.jsonz4Profiling result for a compiled module of benchmark :z+Chrome trace for the profile is written to )group_by_input_shaperw   )sort_by	row_limit)argparseArgumentParseradd_argument
parse_argsbenchmark_kernelsrg   rJ   profiletorchprofilertempfile
gettempdirexport_chrome_tracerY   key_averagestabler   
use_device)r]   benchmark_compiled_module_fnr   parserrL   r   r   r   ppathr   s              r   compiled_module_mainr     s    $$&F
;	   !G	   5	   Dnd.H.HI3%ORVV||^^##$#7 	E1(uVD	E %%'((EF	d#D^DTTUVW;D6BC^^^>
j'?2NO Jefnall	
	E 	Es   E((E1)dataclassesr   collectionsr   r   torch.autogradr   runtime.benchmarkingr   runtime.runtime_utilsr   r   r   r   r   r'   rg   	dataclassri   r   r   r2   rE   r   <module>r      sc      #  % - K $	M
`   k\0
rE   