
    Xh4O                        d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ d Zd Zd	 ZddZddZd dZ G d d          Z G d d          Zd Zd!dZd!dZd Zed"d            Zd!dZdS )#    N)contextmanager)AnyDictList   )language)runtimec                     d                     |           } dddd| z   dg}t          j        |          }|                    t          j        j                                      d          }d |D             }|S )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 ,    g | ]}t          |          S  )int.0xs     `/var/www/tools.fuzzalab.pt/emblema-extractor/venv/lib/python3.11/site-packages/triton/testing.py
<listcomp>znvsmi.<locals>.<listcomp>   s    


a3q66


    )join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutrets       r   nvsmir$      ss    HHUOOEsNU$:<[
\C

!#
&
&C
**SZ(
)
)
/
/
4
4C

3


CJr   c                 l     t                     t                       fdfd|D             S )Nc                     d| cxk    rdk    sn t          d          | dz
  z  }t          j        |          }t          j        |          }||z
  }d|z
  |         z  ||         z  z   S )Nr   r   z%Quantiles must be in the range [0, 1])
ValueErrormathfloorceil)qpointloweruppertans        r   get_quantilez_quantile.<locals>.get_quantile   s}    Q!DEEEQU
5!!	%  EMA5!A%L00r   c                 &    g | ]} |          S r   r   )r   r+   r2   s     r   r   z_quantile.<locals>.<listcomp>'   s!    '''LLOO'''r   )lensorted)r0   r+   r2   r1   s   ` @@r   	_quantiler6      sU    AAq		A1 1 1 1 1 1 ('''Q''''r   c                 0   |-t          | |          }t          |          dk    r|d         }|S |dk    r| S |dk    rt          |           S |dk    rt          |           S |dk    rt	          j        |           S |dk    rt	          j        |           S d S )Nr   r   allminmaxmeanmedian)r6   r4   r9   r:   
statisticsr;   r<   )times	quantilesreturn_moder#   s       r   _summarize_statisticsrA   *   s    y))s88q==a&C
e			5zz			5zz			u%%%		 	  ''' 
!	 r      r;   c                    ddl }|dv sJ |j                            |j                                                  5   |              |5|D ]2}|                                 |                    d           d|_        3|j                            d          }|j                            d          }|                                 t          d          D ]}	 |              |                                 |j        
                                 |                    |          dz  }
|
dk    rd}n t          dt          ||
z                      }|j                                        }|j                            |          5  t          |          D ]}	||D ]	}d|_        
 |              	 ddd           n# 1 swxY w Y   |j        
                                 g }d	}t          |          D ]}	|j                            d          }|j                            d          }|                                 |                                 |                                 |j        
                                 ||                    |          |z  gz  }t#          |||          cddd           S # 1 swxY w Y   dS )
a  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    r   Nr9   r:   r;   r<   r8   Tenable_timing   i  r   
   )torchcudastreamStreamdetach_requires_grad_gradEventrecordrangesynchronizeelapsed_timer:   r   	CUDAGraphgraphreplayrA   )fnrepgrad_to_noner?   r@   rI   r   start_event	end_event_estimate_msn_repeatgr#   	n_retriess                  r   do_bench_cudagraphrb   <   s|    LLLAAAAA			5:,,..	/	/ 0B 0B
#!  		  &&& j&&T&::J$$4$88	q 	 	ABDDDD
   !..y99A=!HH1c#"34455H J  ""Za   	 	8__  +) & &!%		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	
   	y!! 	D 	DA****>>K
((t(<<I   HHJJJJ""$$$K,,Y77(BCCCC$S)[AAa0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0B 0Bs7   E	J;+F=1J;=G	J;G	C)J;;J?J?   d   c                    |dv sJ t           j        j                                         |                                               t           j        j                                        }                    d          }                    d          }|                                 t          d          D ]0}	t           j        j        	                    |            |              1|                                                                  |
                    |          dz  }
t          dt          ||
z                      }t          dt          ||
z                      }fdt          |          D             }fdt          |          D             }t          |          D ]}	 |              t          |          D ]r}||D ]	}d|_        
t           j        j        	                    |           ||                                           |              ||                                          s                                 d	 t          ||          D             }t          |||          S )
a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float], optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    rD   TrE   rG   r   c                 <    g | ]}                     d           S TrE   rP   r   idis     r   r   zdo_bench.<locals>.<listcomp>   s'    IIIA288$8//IIIr   c                 <    g | ]}                     d           S rg   rh   ri   s     r   r   zdo_bench.<locals>.<listcomp>   s'    GGG!--GGGr   Nc                 >    g | ]\  }}|                     |          S r   )rT   )r   ses      r   r   zdo_bench.<locals>.<listcomp>   s(    GGG41aQ^^AGGGr   )r	   driveractiveget_device_interfacerS   get_empty_cache_for_benchmarkrP   rQ   rR   clear_cacherT   r:   r   rO   ziprA   )rX   warmuprY   rZ   r?   r@   cacher[   r\   r]   r^   n_warmupr_   rj   r   r>   rk   s                   @r   do_benchry      sn   $ AAAAA			3	3	5	5BBDDDNNN!??AAE (((..Kt,,I1XX  ))%000
NN**9559K 1c&;.//00H1c#+,,--HIIIIxIIIKGGGGuXGGGI8__  
8__   #!  ))%000A
!NNGG3{I+F+FGGGE 	;???r    c                    ddl }ddl}t          | |j                  s|                    |           } t          ||j                  s|                    |          }|d}t          |          r || j                  n|}|d}t          |          r || j                  n|}t          | |j                  r\| j        |j        k    r|                                 } | 	                                
                                                                 } t          ||j                  r\|j        |j        k    r|                                }|	                                
                                                                 }| j        dk    s|j        dk    r!|j                            | |||d           dS |                    | |||          st          | d	|  d
| d| d| d
          dS )a  
    Asserts that two inputs are close within a certain tolerance.

    :param x: The first input.
    :type x: scala, list, numpy.ndarray, or torch.Tensor
    :param y: The second input.
    :type y: scala, list, numpy.ndarray, or torch.Tensor
    :param atol: The absolute tolerance. Default value is 1e-2.
    :type atol: float, optional
    :param rtol: The relative tolerance. Default value is 0.
    :type rtol: float, optional
    :param err_msg: The error message to use if the assertion fails.
    :type err_msg: str
    r   Ng{Gz?g        r   T)atolrtol	equal_nan)r|   r}    z is not close to z (atol=z, rtol=))numpyrI   
isinstanceTensortensorcallabledtypebfloat16floatcpudetachsizetestingassert_allcloseallcloseAssertionError)r   yr|   r}   err_msgnprI   s          r   assert_closer      s    LLL a&& LLOOa&& LLOO|$TNN444===D|$TNN444===D !U\"" %7en$$		AEEGGNN""$$!U\"" %7en$$		AEEGGNN""$$ 	vzzQVaZZ

""1ad"NNN;;q!$T;22 ^\\!\\a\\\\UY\\\]]]^ ^r   c                       e Zd ZdZ	 	 	 	 	 ddee         dee         dedee         d	ee         d
edeeef         dedededefdZ	dS )	Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rz   FNx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     || _         || _        |
| _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        || _        dS )aq  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        :param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
        :type styles: list[tuple[str, str]]
        N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   s                r   __init__zBenchmark.__init__   s]    ^ 
 "$
"			r   )rz   rz   FFN)
__name__
__module____qualname____doc__r   strr   r   boolr   r   r   r   r   r      s          ; ;c; S	; 	;
 9; I; ; 38n; ; ; ; ; ; ; ; ; ;r   r   c            	       :    e Zd Zd Z	 	 ddedededefdZdd
ZdS )Markc                 "    || _         || _        d S NrX   
benchmarks)r   rX   r   s      r   r   zMark.__init__9  s    $r   F   bench	save_path
show_plots
print_datac                 .	   dd l }dd lm}	 dd l}
|j        }d |j        D             }d |j        D             }t          |j                  }|
                    ||z   |z   |z             }|j        D ]t          t
          t          f          sfd|D             t                    t          |          k    r"t          dt          |           d           t          t          |                    }g g g }}}|j        D ]Q} | j        di ||j        |i|j        |}	 |\  }}}n# t&          $ r	 |d d }}}Y nw xY w||gz  }||gz  }||gz  }Rt                    |z   |z   |z   |j        t          |          <   |j        r4|	                                 |	                                }|d         }t1          |j                  D ]\  }}||dz            ||d	z            }}|j        r|j        |         d         nd }|j        r|j        |         d
         nd }|                    ||         ||         |||           |                                                                sz|                                                                sT|                    t<                    }|                    t<                    }|                    ||         ||d|           |                                  |!                    |j"        p|           |#                    |j$                   |%                    |j&        rdnd           |'                    |j(        rdnd           |r|	)                                 |r6|	*                    |j+        ,                    ||j         d                     |||j        z            }|rA|j-        d
         dk    r0|j.        /                                \  }}||         ||         z
  |d<   |r8ta          |j        dz              ta          |1                                           |r=|2                    |j+        ,                    ||j         d          d| dd           |S )Nr   c                     g | ]}| d S )-minr   r   s     r   r   zMark._run.<locals>.<listcomp>D      666A666r   c                     g | ]}| d S )-maxr   r   s     r   r   zMark._run.<locals>.<listcomp>E  r   r   )columnsc                     g | ]}S r   r   )r   r]   r   s     r   r   zMark._run.<locals>.<listcomp>K  s    (((1Q(((r   z	Expected z values, got r   r   r   )labelcolorlsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   r   tupler4   r'   dictru   r   rX   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullr8   astyper   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   tolistprint	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meany_miny_maxr   dfx_argsrow_meanrow_minrow_maxr   r#   axfirst_xrj   colstycol0col1r   s                                @r   _runz	Mark._run=  s   			''''''!66U%566666U%5666u}%%\\'F"2U":U"B\CC 	E 	EAa$// )(((((((1vvW%% !KS\\!K!K!K!KLLL#gq//**F)+RwgH_ # #dgVVV5>1*=VVvVV;+.(FE55  ; ; ;+.d5EFFF;VH$E7"E7""1gg07:WDBF3r77OO? 	OJJLLLBajG!%"233 V V1!!f*~r!f*~u,1LBel1oa((d,1LBel1oa((d7RU!33GGG||~~))++ VELLNN4F4F4H4H V!LL//E!LL//EOOBwKTQTOUUUIIKKKMM%,1'222MM%,'''MM5;<%%H===MM5;<%%H=== 


 OBGLLu4L4L4LMMNNN%**+ 	-q((**,,JD$DBtH,BvJ 	"%/C'(((",,..!!! 	#IIbgll9.F.F.FGGVl[iVlVlVl!  # # #	s   .D55EErz   c           	         t          | j        t                    }|r| j        gn| j        }g }	 |D ]&}	|                     | j        |	|||fi |           '	 |rt          j        |d           t          t
          j        	                    |d          d          5 }
|

                    d           |d t          |                   D ] }	|

                    d|	j         d           !|

                    d           d d d            n# 1 swxY w Y   n# |rt          j        |d           t          t
          j        	                    |d          d          5 }
|

                    d           |d t          |                   D ] }	|

                    d|	j         d           !|

                    d           d d d            w # 1 swxY w Y   w w xY w|r|r|d	         S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )r   r   r   appendr   r   makedirsopenr   r   writer4   r   )r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfsr   htmls              r   runzMark.run  s   %doyAA*:Odo&&

	3# a a!!)$)E9j*"_"_X^"_"_````a  3I5555"',,y.AA3GG 34JJ/000!+,<S__,<!= P P

#N5?#N#N#NOOOOJJ1222	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3  3I5555"',,y.AA3GG 34JJ/000!+,<S__,<!= P P

#N5?#N#N#NOOOOJJ1222	3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 33  	" "!!}$!!tsD   )D A#DDDAG!A#GGGGGGN)Fr   )FFrz   F)	r   r   r   r   r   r   r   r   r  r   r   r   r   r   7  s        % % % chC C) C C CSW C C C CJ     r   r   c                       fd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                 $    t          |           S r   )r   r   s    r   <lambda>zperf_report.<locals>.<lambda>  s    b*-- r   r   )r   wrappers   ` r   perf_reportr    s     .---GNr   c                     ddl }ddlm} | s|j                                        } |j        j                            |           d         }|j        j                            |           d         }||z  dz  dz  d	z  }|S )
z return DRAM bandwidth in GB/s r   Nr   rp   mem_clock_ratemem_bus_widthr   g    .A   )rI   r	   rp   rJ   current_devicerq   utilsget_device_properties)devicerI   rp   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr    s    LLL -**,,M'==fEEFVWM#99&AA/RIi'!+c1A5GNr   c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                            |          }|d         dk     r| |j	        k    sJ d}ni| |j
        |j        fv rd}nV| |j	        |j        |j        fv rd}n=| |j        t          j        t          j        t          j        fv rd	}nt'          d
          ||z  |z  dz  }|S )Nr   r   r  multiprocessor_count   r     i   i   dtype not supported&.>)rI   r	   rp   rJ   r  rq   r  r  get_device_capabilityfloat16float32int32r   int16int8tl
float8e4nvfloat8e4b15float8e5RuntimeError	r   
clock_rater  rI   rp   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr2    s(   LLL -**,,=&<<VDDE[\_``L11&99J!}q%%%%U]EK000"u}enekBBB"uz2="."+NNN#4555J&)99D@FMr   c                        fd}|S )Nc                 J     t          j                    fd            }|S )Nc                  p   dd l }|                    t          j                                                              }
                                |                                k    }|r|dk    rt          j                            j        d                   }t          j	        d         dd}d|v s
J d            |d         j
        j        j        }| d	j         d
| d}t          j        ddd|gd|          }	|	j        dk    s
J d            dt#          |	j                  v sJ d S  | i | d S )Nr   zcuda-memcheck__file__PATH1)r7  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r  
returncoder   r   )r   r  r@  	ppid_namerun_cuda_memcheckr   r?  test_idr!   r"   target_kwargstest_fns             r   r  z1cuda_memcheck.<locals>.decorator.<locals>.wrapper  sW   MMMrz||4499;;I - 3 3 5 5 G  )Y/%A%Aw''(;J(GHH!z&1UXYY F***,n*** +09<>>!1>>G>>> nox%L]agjkkk~***,e***0C
OOCCCCCC((((((r   )	functoolswraps)rP  r  rO  s   ` r   	decoratorz cuda_memcheck.<locals>.decorator  s>    		!	!	) 	) 	) 	) 	) 
"	!	)" r   r   )rO  rS  s   ` r   cuda_memcheckrT    s$        , r   F    c           	   #     K   	 t          j        g d           t          j        dddd|  d|  g           t          j        dddd| d| g           t          dg          d	         }t          d
g          d	         }t          || z
            dk     sJ d|  d            t          ||z
            dk     sJ d| d            d| z  }d|z  dz  }||fV  t          j        g d           t          j        g d           t          j        g d           d S # t          j        g d           t          j        g d           t          j        g d           w xY w)N)r   r   r   -pmr8  r   r   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryrH   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rX  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r$   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr1  gbpss         r   set_gpu_clockr_    s     C E E EFFF>>>>>	!
 	 	 	 	CMCCMCC	!
 	 	 	 1233A66788;<,.//"4446_\6_6_6_444==011B6668b}8b8b8b666)L8&-dl E E EFFF A A ABBB A A ABBBBB 	 E E EFFF A A ABBB A A ABBBBs   CD! !AE%c                    dd l }ddlm} |s|j                                        }|j        j                            |          d         dz  }|j                                        }|d         dk     r+| |j	        k    rd}nM| |j
        k    rd}n?t          d	          | |j	        k    rd}n"| |j
        |j        fv rd}nt          d	          ||z  |z  d
z  }|S )Nr   r   r  r  r  r      @   r  r   )rI   r	   rp   rJ   r  rq   r  r  r!  r#  r"  r+  r   r,  s	            r   get_max_simd_tflopsrc    s   LLL -**,,=&<<VDDE[\_``L1133J!}qEM!!!em##!4555EM!!!u}en555!4555J&)99D@FMr   )rB   NNr;   )rc   rd   NNr;   )NNrz   r   )rU  rV  )rQ  r(   r   r=   r   r   
contextlibr   typingr   r   r   rz   r   r'  r	   r$   r6   rA   rb   ry   r   r   r   r  r  r2  rT  r_  rc  r   r   r   <module>rf     s        				         



 % % % % % % " " " " " " " " " "              ( ( ( ( ( ($@B @B @B @BF?@ ?@ ?@ ?@D0^ 0^ 0^ 0^f@ @ @ @ @ @ @ @F` ` ` ` ` ` ` `F  
 
 
 
   :  6 C C C C8     r   