
    Xh
K                         d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ d
efdZde
j         fdZ! ej"                    d             Z# ej"                    de$fd            Z%de$fdZ& ej"                    de$fd            Z' ej"        d          d             Z(de$fdZ) ed           G d d                      Z* G d de          Z+dS )    )BaseBackend	GPUTargetLanguage)irpassesllvmnvidia)knobs)
PTXASError)	dataclassN)AnyDictTupleOptional)
ModuleType)Pathtargetc                 L    dt           t          t          t          f         fd}|S )Nreturnc                 f    | j         j        }|j         j        }||k    s
J d            |dk    rdS dS )Nz%lhs and rhs bitwidth must be the same   )   r       )r   r   r   )scalarprimitive_bitwidth)lhs_typerhs_typelhs_bitwidthrhs_bitwidths       q/var/www/tools.fuzzalab.pt/emblema-extractor/venv/lib/python3.11/site-packages/triton/backends/nvidia/compiler.pycheck_dot_compatibilityz-min_dot_size.<locals>.check_dot_compatibility   sE    99|+++-T+++1<<    )r   int)r   r!   s     r    min_dot_sizer$      s0     uS#s]7K         #"r"   r   c                  $    t           j        j        S N)r
   r	   ptxas r"   r    	get_ptxasr)   !   s    <r"   c                      t           j        j        } | | S t          j        t                      j        dg                              d          }|S )Nz	--versionutf-8)r
   r	   mock_ptx_version
subprocesscheck_outputr)   pathdecode)mock_verversions     r    get_ptxas_versionr3   %   sF    |,H%y{{'7&EFFMMgVVGNr"   c                    t          | t                    sJ t          t          |                     d                    \  }}|dk    r|dk     rd|z   S d|z   dz
  S |dk    rd|z   S |dk    rd	|z   S t          d
| z             )zK
    Get the highest PTX version supported by the current CUDA driver.
    .      P         F   
   ?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )
isinstancestrmapr#   splitRuntimeError)cuda_versionmajorminors      r    ptx_get_versionrF   .   s    
 lC(((((sL..s3344LE5{{199::>!{{Ez{{Ez
X[gg
h
hhr"   archc                 \    | j         }|"t                      j        }t          |          }|S r&   )ptx_versionr)   r2   rF   )optionsrG   rI   rC   s       r    get_ptx_version_from_optionsrK   A   s/    %K {{*%l33r"   c                 P    t          | |          }t          d|          }d| }|S )NV   z+ptx)rK   min)rJ   rG   rI   llvm_ptx_versionfeaturess        r    get_featuresrQ   I   s6    .w==K 2{++(&((HOr"   c                     t          | d          5 }t          j        |                                                                          cd d d            S # 1 swxY w Y   d S )Nrb)openhashlibsha256read	hexdigest)r/   fs     r    	file_hashrZ   W   s    	dD		 4Q~affhh''11334 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4s   8AAA
capabilityc                 $    | dk    rdnd}d|  | S )NZ   a sm_r(   )r[   suffixs     r    sm_arch_from_capabilityrb   ]   s(    "$$SS"F%%V%%%r"   T)frozenc                   ~   e Zd ZU dZeed<   dZeed<   dZeed<   dZe	e         ed<   d	Z
eed
<   dZeed<   dZeed<   dZe	e         ed<   dZeed<   dZeed<   dZeed<   dZee         ed<   dZee         ed<   dZeed<   dZee         ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed <   dZeed!<   d" Z d# Z!dS )$CUDAOptions   	num_warpsr9   num_ctas   
num_stagesNmaxnreg)r9   r9   r9   cluster_dimsrI   ptx_optionsir_overrideTenable_fp_fusionFlaunch_cooperative_grid
launch_pdl)fp8e5fp8e4b15supported_fp8_dtypesr(   !deprecated_fp8_dot_operand_dtypestf32default_dot_input_precision)rv   tf32x3ieeeallowed_dot_input_precisionsmax_num_imprecise_acc_defaultextern_libsdebugcudabackend_namesanitize_overflowrG   c                    t          t                    j        dz  }| j        i nt	          | j                  }|                    dd           s&t          j        j        pt          |dz            |d<   t                              | dt          |                                                     | j        dk    r| j        | j        dz
  z  dk    s
J d            d S )Nlib	libdevicezlibdevice.10.bcr|   r   r9   znum_warps must be a power of 2)r   __file__parentr|   dictgetr
   r	   libdevice_pathr?   object__setattr__tupleitemsrg   )selfdefault_libdirr|   s      r    __post_init__zCUDAOptions.__post_init__}   s    h.6 ,4bb$t?O:P:P{D11 	n',|'B'mc.[lJlFmFmK$4k6G6G6I6I0J0JKKK~!!t~!9K'LQR&R&R&R/ 'S&R&R&R&Rr"   c                 v   t          | j                  }t          d t          |d                   D                       |d<   d                    d t          |                                          D                       }t          j        |                    d                    	                                S )Nc              3   >   K   | ]\  }}|t          |          fV  d S r&   )rZ   ).0kvs      r    	<genexpr>z#CUDAOptions.hash.<locals>.<genexpr>   s1      (h(htq!!Yq\\):(h(h(h(h(h(hr"   r|   _c                 "    g | ]\  }}| d | S )-r(   )r   namevals      r    
<listcomp>z$CUDAOptions.hash.<locals>.<listcomp>   s&    SSSID#4#SSSr"   r+   )
r   __dict__r   sortedjoinr   rU   rV   encoderX   )r   	hash_dictkeys      r    hashzCUDAOptions.hash   s    ''	#((h(hviXeNfGgGg(h(h(h#h#h	- hhSS	@Q@Q9R9RSSSTT~cjj1122<<>>>r"   )"__name__
__module____qualname__rg   r#   __annotations__rh   rj   rk   r   rl   r   rI   rm   r?   rn   ro   boolrp   rq   rt   r   ru   rw   rz   r{   r|   r   r}   r   r   rG   r   r   r(   r"   r    re   re   c   s        IsHcJ "GXc]!!!#L%###KK!%K#%%%!d!!!$)T)))J'<%*<<<46%uSz666'----/I %*III*.!4...KE4L#"t"""D#0 0 0? ? ? ? ?r"   re   c                   
    e Zd Zedefd            Zd ZdefdZdeddf fdZ	de
fdZd	 Zd
 Zdeeef         fdZd Zed             Zed             Zd Zd Zd Zd Zd Z ej                    d             Z xZS )CUDABackendr   c                     | j         dk    S )Nr~   )backend)r   s    r    supports_targetzCUDABackend.supports_target   s    ~''r"   c                     d}t          j        ||          }|st          d|           t          |                    d                    S )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r9   )re	fullmatch
ValueErrorr#   group)r   rG   patternmatchs       r    _parse_archzCUDABackend._parse_arch   sP    Wd++ 	SQQQRRR5;;q>>"""r"   r   c                 @    |                      |j                  }d| S )Ncuda:)r   rG   )r   rJ   r[   s      r    get_target_namezCUDABackend.get_target_name   s%    %%gl33
#z###r"   Nc                 X    t                                          |           d| _        d S )Ncubin)super__init__
binary_ext)r   r   	__class__s     r    r   zCUDABackend.__init__   s&       !r"   c                 @   dt           j        j        pd| j        j         i}|                    fdt          j                                        D                        t          | 
                    |d                             }d|vrSt          t          j                  }|dk    r|                    d           t          t          |                    |d<   d|vr|dk    rd	|d<   d
|vrt           j        j        |d
<   |dk    rdnd|d<   t          di |S )NrG   smc                 :    i | ]}|v |         ||         S r&   r(   )r   r   optss     r    
<dictcomp>z-CUDABackend.parse_options.<locals>.<dictcomp>   s7    uuuATUY]T]T]aefgahatQQatatatr"   rt   Y   fp8e4nvru   r]   )rs   ro   i   @r   r{   r(   )r
   runtimeoverride_archr   rG   updatere   __dataclass_fields__keysr#   r   setrt   addr   r   languagedefault_fp_fusion)r   r   argsr[   rt   s    `   r    parse_optionszCUDABackend.parse_options   s:   3N7NDK<L7N7NOuuuu)I)N)N)P)Puuuvvv))$v,7788
!--#&{'G#H#H R$((333+08L1M1M+N+ND'(.d::R<J89T))',~'GD#$9Cr9I9Iq,-""T"""r"   c                 r    |j         |j        |j        |j        d         |j        d         |j        d         fS )Nr   r9      )rg   rh   sharedrl   )r   metadatas     r    pack_metadatazCUDABackend.pack_metadata   s>    O!!$!!$!!$
 	
r"   c                     dd l mc mc m} t	          |                     |j                            }|dk    r|j        n|j        t          | j
                  d}|S )Nr   r8   )convert_custom_typesr$   )triton.language.extra.cudar   extrar~   r#   r   rG   convert_custom_float8_sm80convert_custom_float8_sm70r$   r   )r   rJ   r~   r[   codegen_fnss        r    get_codegen_implementationz&CUDABackend.get_codegen_implementation   s~    111111111111))',7788
 0:R/?/?D++TEd%%
 

 r"   c                     ddl m} d|iS )Nr   )r   ztriton.language.extra.libdevice)r   r   )r   r   s     r    get_module_mapzCUDABackend.get_module_map   s    88888819==r"   c                 .    t          j        |           d S r&   )r	   load_dialects)r   ctxs     r    r   zCUDABackend.load_dialects   s    S!!!!!r"   c                    t          j        | j                  }|                                 t          j                            |           t          j                            |           |dz  dk     rt          j        	                    |           t          j        
                    |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           |                    |            | S )Nr<   	   )r   pass_managercontextenable_debugr   commonadd_inlinerttiradd_rewrite_tensor_pointer(add_rewrite_tensor_descriptor_to_pointeradd_canonicalizeradd_combineadd_reorder_broadcastadd_cseadd_symbol_dceadd_loop_unrollrun)modr   optr[   pms        r    	make_ttirzCUDABackend.make_ttir   s   _S[))
!!"%%%..r222aK@@DDD''+++###))"---b!!!$$R(((##B'''
s
r"   c                 .   |j         E|                     dt          j        | j                                      |j                              t          j                    }|j        6|j        d         |_	        |j        d         |_
        |j        d         |_        t          j        | j                  }|                                }t          j                            |d| |j        d|j                   t          j                            |           |dz  dk    rt          j                            |           t          j        j                            ||           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j                            ||d	k               t          j        j                            |           t          j                            |           |dz  d
v r2t          j                            |           t          j                             |           t          j        !                    |           t          j                             |           t          j        "                    |           t          j        j#        $                    ||j%        |           t          j        &                    ||j%                   t          j        '                    |           t          j        (                    ||j%        |           n|dz  dk    rt          j                            |           t          j                             |           t          j        !                    |           t          j        )                    |           t          j        *                    |           t          j        j        +                    |           t          j        &                    ||j%                   t          j        '                    |           t          j        ,                    ||j%                   t          j        (                    ||j%        |           t          j        "                    |           t          j        j        -                    |           nt          j        !                    |           t          j                             |           t          j                            |           t          j        .                    |           t          j                            ||d	k               t          j        /                    |           t          j        j        0                    |           t          j                            |           t          j        j        1                    |           t          j        2                    |           t          j        3                    |           t          j                            |           t          j        4                    |           |dz  dk    rHt          j        j        5                    |           t          j        j        6                    |           t          j        7                    |           t          j                             |           |8                    |            |j	        |j
        |j        f|d<   | 9                                }||d<   | S )Nzttg.maxnregr   r9   r   r   r   r<   r   r8   )r   r   r   rl   tensordesc_meta):rk   set_attrr   builderr   get_int32_attrr	   ClusterInforl   clusterDimXclusterDimYclusterDimZr   r   r   r   add_convert_to_ttgpuirrg   rh   ttgpuiradd_coalesceadd_f32_dot_tc	ttnvgpuiradd_plan_ctaadd_remove_layout_conversionsadd_optimize_thread_localityadd_accelerate_matmuladd_optimize_dot_operands add_optimize_descriptor_encodingadd_loop_aware_cseadd_fuse_nested_loopsr   r   add_triton_licm add_combine_tensor_select_and_ifhopperadd_hopper_warpspecrj   add_assign_latenciesadd_schedule_loopsadd_pipelineadd_optimize_accumulator_initadd_hoist_tmem_allocadd_promote_lhs_to_tmemadd_warp_specializeadd_remove_tmem_tokensadd_prefetchadd_coalesce_async_copyadd_optimize_tmem_layoutsadd_interleave_tmemadd_reduce_data_duplicationadd_reorder_instructionsr   add_tma_loweringadd_fence_insertionadd_sccpr   get_tensordesc_metadata)r   r   r   r[   cluster_infor   dump_enabledr   s           r    
make_ttgirzCUDABackend.make_ttgir   s    ;"LL
3;(?(?(N(Ns{([([\\\)++''*'7':L$'*'7':L$'*'7':L$_S[))((**2/Cz/C/CS]TVX[Xdeee##B'''q  N))"---,,R>>>44R88833B777,,R00044R88800Z25EFFF@@DDD&&r***v%%N00444M++B///K''+++M++B///N;;B???M 44RVVVN//CNCCCN--b111N''CNLIIII2##N00444M++B///K''+++N88<<<N//333M#;;B???N//CNCCCN--b111N..r3>BBBN''CNLIIIN;;B???M#::2>>>>K''+++''+++&&r***##B'''00Z25EFFF..r22299"===44R88833B777222666//333&&r***$$R(((q  M#44R888M#77;;;r"""''+++
s$0$<l>VXdXp#q 5577&5"#
r"   c                    |}t          j        |j                  }|                                 t          j                            |           t          j                            |           t          j	        
                    |           t          j                            |           t          j                            |           |                    |           |                                |d<   |S )Nr   )r   r   r   r   r   r  r   r   r!  r   r  r   r  r   r"  )r   srcr   rJ   r[   r   r   s          r    	ttgir_optzCUDABackend.ttgir_opt0  s    _S[))
""2&&&r"""&&r***((,,,77;;;
s&)&A&A&C&C"#
r"   c                 4   t          || j        j                  }|}t          j        |j                  }|                                 t          j        j	        
                    |           t          j                            |           t          j                            |           t          j                            |           t          j                            |           t          j        j	                            |           t          j                            |           t          j        j                            |||           t          j                            |           t          j                            |           t          j        j	                            |           t          j        j	                            |           t          j                            |           t          j                            |           t          j                            |           t4          j        j        st          j                            |           |                    |           tA          j!                     tA          j                    }t4          j        j"        rtG          d          tA          j$        ||          }	tK          |          }
tM          || j        j                  }d}t          j'                     tA          j(        |	||
|           t          j)        |	           |j*        r&d |j*        D             }tA          j+        |	|           tA          j,        |	t@          j-                   |.                    d          }|||d<   |.                    d          |d<   |.                    d          |d	<   |.                    d
          |d<   |.                    d          |d<   t_          |	          }~	~|S )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendnvptx64-nvidia-cudac                     g | ]\  }}|S r(   r(   )r   r   r/   s      r    r   z)CUDABackend.make_llir.<locals>.<listcomp>h  s    BBBltTTBBBr"   zttg.total-num-warpsrg   z
ttg.sharedr   zttg.tensor_memory_size	tmem_sizezttg.global_scratch_memory_sizeglobal_scratch_sizez#ttg.global_scratch_memory_alignmentglobal_scratch_align)0rK   r   rG   r   r   r   r   r	   r   r  add_lower_mmar  r  add_allocate_warp_groupsconvertadd_scf_to_cfadd_allocate_shared_memoryadd_allocate_tensor_memory"add_allocate_global_scratch_memoryadd_to_llvmirr   r   r   add_nvgpu_to_llvmadd_warp_specialize_to_llvmr   r
   compilationdisable_line_infollvmiradd_di_scoper   r   init_targetsenable_asanrB   	to_modulerb   rQ   set_short_ptrattach_datalayoutset_nvvm_reflect_ftzr|   link_extern_libsoptimize_moduleOPTIMIZE_O3get_int_attrr?   )r   r'  r   rJ   r[   rI   r   r   r   llvm_modprocrP   triplepathstotal_num_warpsrets                   r    	make_llirzCUDABackend.make_llir?  sf   27DK<LMM_S[))
--b11177;;;//333$$R(((11"555::2>>>99"===++B
KHHH''+++b!!!11"555;;B???''+++b!!!$$R((( 2 	+M&&r***
s,..( 	mkm m m>#w//&z22)9::&xx@@@#H--- 	3BBg.ABBBE!(E222Xt'7888 **+@AA&$3H[! --l;; # 0 01I J J*-*:*:;[*\*\&'+.+;+;<a+b+b'((mm
r"   c           	      p   t          || j        j                  }d}t          |          }t	          || j        j                  }t          j        ||||g |j        d          }	t          j	        d|	          }
t          |
          dk    sJ |
d         |d<   |dz   d|dz   }t          j        d	d
| |	t          j                  }	t          j        dd| |	t          j                  }	t          j        dd|	          }	t          j        j        rt!          d           t!          |	           |	S )Nr*  Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r9   r   r   r<   r5   z\.version \d+\.\d+z	.version )flagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*r_   z // -----// NVPTX Dump //----- //)rK   r   rG   rb   rQ   r   translate_to_asmro   r   findalllensub	MULTILINEr
   r	   
dump_nvptxprint)r   r'  r   r   r[   rI   rI  rH  rP   rL  namess              r    make_ptxzCUDABackend.make_ptx{  s6   238HII&&z22T[%566#CxSEY[`aa
FLL5zzQ 8$b;;;r>;;f*,E,E,EsRTR^___f')Cz)C)CSPRP\]]]f+R55<" 	4555#JJJ
r"   c                 L   t                      j        }t          j        ddd          5 }t          j        ddd          5 }|                    |           |                                 |j        dz   }t          j        j	        rdd	gndg}	|j
        rg nd
g}
t          |          }t          j        j        rddgng }|j        r|j                            d          ng }|g|	|
d||d| |j        d|}	 t!          j        |dd|           t$          j                            |j                  rt%          j        |j                   t$          j                            |j                  rt%          j        |j                   n# t           j        $ r}t-          |j                  5 }|                                }d d d            n# 1 swxY w Y   t$          j                            |j                  rt%          j        |j                   |j        dk    rd}n%|j        dt2          j        z   k    rd}n
d|j         }t7          | d| dd                    |           d          d }~ww xY wt-          |d          5 }|                                }d d d            n# 1 swxY w Y   t$          j                            |          rt%          j        |           d d d            n# 1 swxY w Y   d d d            n# 1 swxY w Y   |S )NFwz.ptx)deletemodera   rz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-level0 z-vz--gpu-name=z-oT)check	close_fdsstderr   z!Internal Triton PTX codegen error   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: 
rS   )r)   r/   tempfileNamedTemporaryFilewriteflushr   r
   r9  r:  ro   rb   r	   disable_ptxas_optrm   rA   r-   r   osexistsremoveCalledProcessErrorrT   rW   
returncodesignalSIGSEGVr   r   )r   r'  r   r   r[   r'   fsrcflogfbin	line_infofmadrG   disable_optptx_extra_options	ptxas_cmdelog_filelogerrorrY   r   s                        r    
make_cubinzCUDABackend.make_cubin  sM    (COOO .	 SW'u3vNNN.	 RVJJsOOOJJLLL9t#DAFARAdw&<==kvjwI-C22N3CD*:66D 38,2PX=#..VXK ?Bo U 5 5c : : :SU !$(*.1<?PRf`dRfRfhlhqswILydSSSS7>>$),, )Idi(((7>>$),, )Idi(((0 L L L$)__ *"--//C* * * * * * * * * * * * * * *7>>$),, )Idi(((<3&&?EE\S6>%9994EELalLLE E "K "K58"K "K3688I3F3F"K "K "K L L LL" dD!! !Q! ! ! ! ! ! ! ! ! ! ! ! ! ! !w~~d##  	$].	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	  .	 ^ s   LB>LBFLJ$I?8G	I?GI? G!BI??JLJ8,L8J<<L?J< 6L6LL	L	L	
LL Lc                                            j                  |t          j        k    r fd|d<    fd|d<   n|t          j        k    r
 fd|d<    fd|d<    fd|d	<    fd
|d<   d S )Nc                 4                         | |          S r&   )r   r'  r   r[   rJ   r   s     r    <lambda>z(CUDABackend.add_stages.<locals>.<lambda>  s    4>>#xQXZd3e3e r"   r   c                 4                         | |          S r&   )r%  r  s     r    r  z(CUDABackend.add_stages.<locals>.<lambda>  s    DOOCSZ\f4g4g r"   ttgirc                 4                         | |          S r&   )r(  r  s     r    r  z(CUDABackend.add_stages.<locals>.<lambda>  s    DNN3RY[e4f4f r"   c                 4                         | |          S r&   )rM  r  s     r    r  z(CUDABackend.add_stages.<locals>.<lambda>  s    t~~c8WV`/a/a r"   llirc                 H                         | |j        j                  S r&   )rX  r   rG   r'  r   rJ   r   s     r    r  z(CUDABackend.add_stages.<locals>.<lambda>  s     dmmC7TXT_Td.e.e r"   ptxc                 H                         | |j        j                  S r&   )r~  r   rG   r  s     r    r  z(CUDABackend.add_stages.<locals>.<lambda>  s     XwX\XcXh0i0i r"   r   )r   rG   r   TRITONGLUON)r   stagesrJ   r   r[   s   ` ` @r    
add_stageszCUDABackend.add_stages  s    %%gl33
x&&eeeeeeF6NggggggF7OO''ffffffF7Oaaaaaaveeeeeuiiiiiwr"   c                 @    t                      }| d| j        j         S )Nr   )r3   r   rG   )r   r2   s     r    r   zCUDABackend.hash  s&    #%%..DK,...r"   )r   r   r   staticmethodr   r   r   r?   r   r   r   r   r   r   r   r   r   r   r   r%  r(  rM  rX  r~  r  	functools	lru_cacher   __classcell__)r   s   @r    r   r      s       (	 ( ( ( \(# # #$# $ $ $ $"y "T " " " " " "#S # # # #,
 
 
  >S*_ 5 > > > >" " "   \  H H \HT  : : :x  ,1 1 1f	j 	j 	j Y/ / / / / / /r"   r   ),triton.backends.compilerr   r   r   triton._C.libtritonr   r   r   r	   tritonr
   triton.runtime.errorsr   dataclassesr   r  typingr   r   r   r   typesr   rU   r   rf  rp  rk  r-   pathlibr   r$   
NvidiaToolr)   r  r3   r#   rF   rK   rQ   rZ   rb   re   r   r(   r"   r    <module>r     s   E E E E E E E E E E 8 8 8 8 8 8 8 8 8 8 8 8       , , , , , , ! ! ! ! ! !     - - - - - - - - - - - -        				   				          # # # # #5#        iS i i i i$     
 
 
 
 
 T4 4 4
& & & & & $'? '? '? '? '? '? '? '?TD/ D/ D/ D/ D/+ D/ D/ D/ D/ D/r"   