
    Yhgp                        U d dl Z d dlmZmZmZ ddlmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlZddgZ ed          Z ed          Ze j        j        Zd Zi Z e!eef         e"d<   d Z#d;deeeef         geeef         f         fdZ$ e$ej%                  ddde&fd            Z' e$ej(                  d<de&fd            Z) e$ej*                  d<de&fd            Z+ e$ej,                  d<de&fd            Z- e$ej.                  	 	 	 	 	 d=de&fd            Z/	 d;de0e&         de0e&         de0e&         de1de&f
d Z2 e$ej3        ej4        g          ddde&fd!            Z5 e$ej6                  de&fd"            Z7d# Z8 e$ej9        ej:        ej;        g          ddde&fd$            Z<d% Z=dd&dee>e>e&d'f         e>e&d'f         e>e&d'f         e	e>e&d'f                  f                  fd(Z?dd&dee>e>e&d'f         e>e&d'f         e>e&d'f         e	e>e&d'f                  f                  fd)Z@ e$ejA        d*+          ddde&fd,            ZB e$ejC        d*+          de&fd-            ZDd. ZE e$ejF        ejG        ejH        g          ddde&fd/            ZI e$ejJ        d*+          de&fd0            ZK e$ejL        d*+          de&fd1            ZMi ej%        e'ej(        e)ej*        e+ej,        e-ej.        e/ej3        e5ej4        e5ej6        e7ej9        e<ej:        e<ej;        e<ejF        eIejG        eIejH        eIejA        eBejC        eDejJ        eKejL        eMiZ d2 ZNg d3ZOd4 ZPd5 ZQd6 ZRd7 ZS G d8 d          ZT G d9 d:e          ZUdS )>    N)tree_maptree_flattentree_unflatten   )ModuleTracker)AnyOptionalUnionTypeVarCallable)Iterator)	ParamSpec)defaultdict)TorchDispatchModeprodwrapsFlopCounterModeregister_flop_formula_T_Pc                 H    t          | t          j                  r| j        S | S N)
isinstancetorchTensorshape)is    j/var/www/tools.fuzzalab.pt/emblema-extractor/venv/lib/python3.11/site-packages/torch/utils/flop_counter.py	get_shaper!      s"    !U\"" wH    flop_registryc                 B     t                     d d fd
            }|S )N)out_valc                 P    t          t          ||| f          \  }}} |d|i|S )N	out_shape)r   r!   )r%   argskwargsr'   fs       r    nfzshape_wrapper.<locals>.nf   s:    "*9tVW6M"N"Nfiq$6)6v666r"   r   r*   r+   s   ` r    shape_wrapperr-      s@    
1XX 7 7 7 7 7 7 X7 Ir"   Freturnc                 |     dt           t          t          f         dt           t          t          f         f fd}|S )Nflop_formular.   c                      st                       fd}t          j        j                            |            S )Nc                     t          | t          j        j                  s"t	          d|  dt          |                      | t          v rt          d|            t          | <   d S )Nzlregister_flop_formula(targets): expected each target to be OpOverloadPacket (i.e. torch.ops.mylib.foo), got z which is of type zduplicate registrations for )r   r   _opsOpOverloadPacket
ValueErrortyper#   RuntimeError)targetr0   s    r    registerz=register_flop_formula.<locals>.register_fun.<locals>.register(   s    fej&ABB A @@ @15f@ @A A A &&"#J&#J#JKKK$0M&!!!r"   )r-   r   utils_pytree	tree_map_)r0   r9   get_rawtargetss   ` r    register_funz+register_flop_formula.<locals>.register_fun$   sU     	7(66L	1 	1 	1 	1 	1 	%%h888r"   )r   r   r   )r>   r=   r?   s   `` r    r   r   #   sO    8BF#3 R8H       & r"   )r'   c                <    | \  }}|\  }}||k    sJ ||z  dz  |z  S )zCount flops for matmul.    )	a_shapeb_shaper'   r(   r)   mkk2ns	            r    mm_floprI   9   s5    
 DAqEB7777q519q=r"   c                 "    t          ||          S )zCount flops for addmm.rI   
self_shaperC   rD   r'   r)   s        r    
addmm_floprN   D   s     7G$$$r"   c                 Z    | \  }}}|\  }}}	||k    sJ ||k    sJ ||z  |	z  dz  |z  }
|
S )z"Count flops for the bmm operation.rA   rB   )rC   rD   r'   r)   brE   rF   b2rG   rH   flops              r    bmm_floprS   I   sO    
 GAq!IBA77777777q519q=1DKr"   c                 "    t          ||          S )z&Count flops for the baddbmm operation.rS   rL   s        r    baddbmm_floprV   V   s    
 GW%%%r"   c	                 "    t          | |          S )zCount flops for _scaled_mm.rK   )
rC   rD   scale_a_shapescale_b_shape
bias_shapescale_result_shape	out_dtypeuse_fast_accumr'   r)   s
             r    _scaled_mm_flopr^   ]   s     7G$$$r"   x_shapew_shaper'   
transposedc                     | d         }|r| n|dd         }|^}}}	 t          |          t          |          z  |z  |z  |z  dz  }	|	S )a  Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias are ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    r   rA   Nr   )
r_   r`   r'   ra   
batch_size
conv_shapec_outc_infilter_sizerR   s
             r    conv_flop_countrh   n   sj    * J'6''Y;J 'E4+ 
d;///*<uDtKaODKr"   c                (    t          | |||          S )zCount flops for convolution.ra   )rh   )
r_   r`   _bias_stride_padding	_dilationra   r'   r(   r)   s
             r    	conv_flopro      s     7GY:NNNNr"   c                 |   d }d}	 |
d         r+t          |d                   }|t          | |||           z  }|
d         rzt          |d                   }|r2|t           ||            ||           ||          d          z  }n1|t           ||           ||            ||          d          z  }|S )Nc                 R    | d         | d         gt          | dd                    z   S )Nr   r   rA   )list)r   s    r    tzconv_backward_flop.<locals>.t   s(    a%(#d59oo55r"   r   r   Frj   )r!   rh   )grad_out_shaper_   r`   rk   rl   rm   rn   ra   _output_padding_groupsoutput_maskr'   rs   
flop_countgrad_input_shapegrad_weight_shapes                   r    conv_backward_flopr{      s    6 6 6JDL 1~ a$Yq\22ong?OU_Q_```
1~ q%il33 	q/!!N*;*;QQwZZK\I]I]joppppJJ /!!G**aa6G6GK\I]I]joppppJr"   c                    | \  }}}}|\  }}}	}
|\  }}}}||cxk    r|k    r%n n"||cxk    r|k    rn n||
k    r|	|k    r||
k    sJ d}|t          ||z  ||f||z  ||	f          z  }|t          ||z  ||	f||z  |	|f          z  }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    r   rU   )query_shape	key_shapevalue_shaperP   hs_qd_q_b2_h2s_k_d2_b3_h3_s3d_vtotal_flopss                   r    sdpa_flop_countr     s     !NAq#s"Cc3$Cc3????s?????qC33#::#**QTX[Q[Q[Q[Q[K8QUC-AsC/@AAAK8QUC-AsC/@AAAKr"   c                $    t          | ||          S )Count flops for self-attention.r   )r}   r~   r   r'   r(   r)   s         r    	sdpa_flopr     s     ;	;???r"   c                     ddl m} ddlm} t	          | ||f          s6| j        j        dk    r&|                                                                 S |g| 	                    d          dz
  z  S )z
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    r   )
FakeTensor)FunctionalTensormetar   )
torch._subclasses.fake_tensorr   #torch._subclasses.functional_tensorr   r   devicer6   difftolistsize)offsetsmax_lenr   r   s       r    _offsets_to_lengthsr     s    
 988888DDDDDDg
,<=>> '7>CVZ`C`C`||~~$$&&&9Q!+,,r"   )grad_out.c              #     K   |t          |j                  dk    sJ t          |j                  dk    sJ ||j        | j        k    sJ | j        \  }}	}
|j        \  }}}|j        \  }}}|J |J |j        |j        k    sJ t          ||          }t          ||          }t          ||          D ]%\  }}d|	||
f}d|||f}d|||f}||nd}||||fV  &dS | j        |j        |j        ||j        ndfV  dS )a;  
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   lenr   r   zip)querykeyvaluer   	cum_seq_q	cum_seq_kmax_qmax_k_h_qr   h_kd_kh_vr   seq_q_lengthsseq_k_lengths	seq_q_len	seq_k_lennew_query_shapenew_key_shapenew_value_shapenew_grad_out_shapes                          r    %_unpack_flash_attention_nested_shapesr   *  st     $  39~~""""5;1$$$$8>U[#@#@#@#@k3i3k3$$$$$$)/1111+Iu==+Iu==&)-&G&G 	V 	V"Y	 #y#6OY4M #y#6O4<4Hd!=/CUUUUUU
+sy%+AUx~~[_
______r"   c              #   
  K   |t          |j                  dk    sJ t          |j                  dk    sJ ||j        | j        k    sJ | j        \  }}}	}
|j        \  }}}}|j        \  }}}}|J |J |j        |j        k    sJ t          ||          }t          ||          }t          ||          D ]%\  }}d|	||
f}d|||f}d|||f}||nd}||||fV  &dS | j        |j        |j        ||j        ndfV  dS )a?  
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    N   r   r   )r   r   r   r   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_kr   r   r   r   r   r   r   	seqlens_q	seqlens_klen_qlen_kr   r   r   r   s                          r    )_unpack_efficient_attention_nested_shapesr   X  s{     $  39~~""""5;1$$$$8>U[#@#@#@#@1c31c31c3''''''!\%77777'lCC	'lCC		955 	V 	VLE5 #uc2OUC0M #uc2O4<4Hd!=/CUUUUUU
+sy%+AUx~~[_
______r"   T)r=   c          	      `    t          | ||||||          }
t          d |
D                       S )r   )r   r   r   r   r   r   r   c              3   B   K   | ]\  }}}}t          |||          V  d S r   r   .0r}   r~   r   r   s        r    	<genexpr>z0_flash_attention_forward_flop.<locals>.<genexpr>  J        2KK 	Y<<     r"   r   sum)r   r   r   r   r   r   r   r'   r(   r)   sizess              r    _flash_attention_forward_flopr     s]    " 2  E   6;     r"   c           	      `    t          | ||||||          }
t          d |
D                       S )r   )r   r   r   r   r   r   r   c              3   B   K   | ]\  }}}}t          |||          V  d S r   r   r   s        r    r   z4_efficient_attention_forward_flop.<locals>.<genexpr>  r   r"   r   r   )r   r   r   biasr   r   r   r   r(   r)   r   s              r    !_efficient_attention_forward_flopr     s]    " 6!!!!  E   6;     r"   c                    d}|\  }}}}|\  }	}
}}|\  }}}}| \  }}}}||	cxk    r|cxk    r|k    r n n||
cxk    r|cxk    r|k    r	n n||k    sJ ||k    r||k    r||k    sJ d}|t          ||z  ||f||z  ||f          z  }|t          ||z  ||f||z  ||f          z  }|t          ||z  ||f||z  ||f          z  }|t          ||z  ||f||z  ||f          z  }|t          ||z  ||f||z  ||f          z  }|S )Nr   rU   )rt   r}   r~   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   _b4_h4_s4_d4s                        r    sdpa_backward_flop_countr     s   K NAq#s"Cc3$Cc3'Cc3!!!!s!!!!c!!!!!a3&<&<&<&<#&<&<&<&<&<&<&<&<&<#::#**K 8QUC-AsC/@AAAK 8QUC-AsC/@AAAK8QUC-AsC/@AAAK 8QUC-AsC/@AAAK8QUC-AsC/@AAAKr"   c                &    t          | |||          S )z(Count flops for self-attention backward.r   )rt   r}   r~   r   r'   r(   r)   s          r    sdpa_backward_flopr     s    
 $NKKXXXr"   c
           
      b    t          |||| ||||	          }t          d |D                       S )N)r   r   r   r   r   r   r   r   c              3   D   K   | ]\  }}}}t          ||||          V  d S r   r   r   r}   r~   r   rt   s        r    r   z1_flash_attention_backward_flop.<locals>.<genexpr>  L        ?KK 	!iUU     r"   r   )r   r   r   r   out	logsumexpr   r   r   r   r(   r)   shapess                r    _flash_attention_backward_flopr     s`    " 3	 	 	F   CI     r"   c
           
      b    t          |||| ||||	          }t          d |D                       S )N)r   r   r   r   r   r   r   r   c              3   D   K   | ]\  }}}}t          ||||          V  d S r   r   r   s        r    r   z5_efficient_attention_backward_flop.<locals>.<genexpr>&  r   r"   r   )r   r   r   r   r   r   r   r   r   r   r(   r)   r   s                r    "_efficient_attention_backward_flopr     s`    " 7!!!!	 	 	F   CI     r"   c                 6    t          | t                    s| fS | S r   )r   tuple)xs    r    normalize_tupler   A  s     a tHr"   ) KMBTc                     t          dt          t          t                    dz
  t          t	          |                     dz
  dz                      }t          |         S )Nr   r   rA   r   )maxminr   suffixesstr)numberindexs     r    get_suffix_strr   J  sJ     3s8}}q(3s6{{+;+;a+?A*EFFGGEE?r"   c                 j    t                               |          }| d|z  z  d}|t           |         z   S )Ni  z.3f)r   r   )r   suffixr   r   s       r    convert_num_with_suffixr   Q  s6    NN6""E%++E8E?""r"   c                      |dk    rdS | |z  dS )Nr   0%z.2%rB   )numdenoms     r    convert_to_percent_strr   X  s     zztEkr"   c                 <     t                      fd            }|S )Nc                 R    t          |           \  }} | }t          ||          S r   )r   r   )r(   	flat_argsspecr   r*   s       r    r+   z)_pytreeify_preserve_structure.<locals>.nf^  s/    &t,,	4amc4(((r"   r   r,   s   ` r    _pytreeify_preserve_structurer  ]  s3    
1XX) ) ) ) X)
 Ir"   c                        e Zd ZdZ	 	 	 	 ddeeej        j        e	ej        j                 f                  de
dedeeeef                  f fd	Zd
e
fdZd
eeeee
f         f         fdZddZd Zd Zd Z xZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    NrA   Tmodsdepthdisplaycustom_mappingc                 R   t                                                       t          d           | _        || _        || _        d | _        |i }|t          j        dd           i t          d |
                                D             | _	        t                      | _        d S )Nc                  *    t          t                    S r   )r   intrB   r"   r    <lambda>z*FlopCounterMode.__init__.<locals>.<lambda>  s    +VYJZJZ r"   z<mods argument is not needed anymore, you can stop passing itrA   )
stacklevelc                 Z    i | ](\  }}|t          |d d          r|nt          |          )S )_get_rawF)getattrr-   r   rF   vs      r    
<dictcomp>z,FlopCounterMode.__init__.<locals>.<dictcomp>  s<    nnntqRSqwq*e44J!!-:J:Jnnnr"   )super__init__r   flop_countsr  r  modewarningswarnr#   itemsr   mod_tracker)selfr  r  r  r	  	__class__s        r    r  zFlopCounterMode.__init__{  s     	6ABZBZ6[6[
04	!NMXefgggg

nnWeWkWkWmWmnnn
 )??r"   r.   c                 Z    t          | j        d                                                   S )NGlobal)r   r  valuesr  s    r    get_total_flopszFlopCounterMode.get_total_flops  s$    4#H-4466777r"   c                 H    d | j                                         D             S )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        c                 4    i | ]\  }}|t          |          S rB   )dictr  s      r    r  z3FlopCounterMode.get_flop_counts.<locals>.<dictcomp>  s$    @@@tq!477@@@r"   )r  r  r"  s    r    get_flop_countszFlopCounterMode.get_flop_counts  s(     A@t'7'='='?'?@@@@r"   c                 @   
 | j         }|d}dd l}d|_        g d}g }                                 
t	          
          d
 fd}t           j                                                  D ]L}|dk    r	|                    d          d	z   }||k    r( |||d	z
            }|	                    |           Md j        v r$s"|D ]}	d
|	d         z   |	d<    |dd          |z   }t          |          dk    rg dg}|                    ||d          S )Ni?B r   T)ModuleFLOPz% TotalFc           	         t          
j        |                                                    }	|k    z  	d|z  }g }|                    || z   t	          |          t          |          g           
j        |                                          D ]L\  }}|                    |dz   t          |          z   t	          |          t          |          g           M|S )N z - )r   r  r!  appendr   r   r  r   )mod_namer  r   paddingr!  rF   r  global_flopsglobal_suffixis_global_subsumedr  s          r    process_modz.FlopCounterMode.get_table.<locals>.process_mod  s     d.x8??AABBK+"==EkGFMM("']CC&{LAA   
 (288::  1eOc!ff,+A}==*1l;;    
 Mr"   r   .r   r,  )r   0r   )leftrightr7  )headerscolalign)r  tabulatePRESERVE_WHITESPACEr#  r   sortedr  keyscountextendr   )r  r  r:  headerr!  r3  mod	mod_depth
cur_valuesr   r0  r1  r2  s   `         @@@r    	get_tablezFlopCounterMode.get_table  s   =JE=E'+$...++--&|44"	 	 	 	 	 	 	 	, $*//1122 	& 	&Ch		#*I5  $S)a-88JMM*%%%%
 t'''0B' * *q>a [1--6Fv;;!+++,F  B\ ]]]r"   c                     | j                                          | j                                         t	          |           | _        | j                                         | S r   )r  clearr  	__enter___FlopCounterModer  r"  s    r    rG  zFlopCounterMode.__enter__  sT       ""$$$$T**		r"   c                     | j         J  | j         j        | }d | _         | j                                         | j        r't	          |                     | j                             |S r   )r  __exit__r  r  printrD  r  )r  r(   rP   s      r    rJ  zFlopCounterMode.__exit__  sh    y$$$DI%	!!###< 	.$..,,---r"   c                     || j         v rP| j         |         } ||i |d|i}t          | j        j                  D ]}| j        |         |xx         |z  cc<   |S )Nr%   )r#   setr  parentsr  )r  func_packetr   r(   r)   flop_count_funcrx   pars           r    _count_flopszFlopCounterMode._count_flops  s    $,,,"0=O($F&FF#FFFJ4+344 A A %k222j@2222
r"   )NrA   TNr   )__name__
__module____qualname____doc__r	   r
   r   nnr)  rr   r  boolr&  r   r  r#  r   r'  rD  rG  rJ  rR  __classcell__)r  s   @r    r   r   g  s>        * MQ 7;+ +5$ux2G!GHI+ + 	+
 %T#s(^4+ + + + + +*8 8 8 8 8
Ac4S>&9!: 
A 
A 
A 
A:^ :^ :^ :^z          r"   c                   "    e Zd ZdefdZddZdS )rH  counterc                     || _         d S r   )r[  )r  r[  s     r    r  z_FlopCounterMode.__init__  s    r"   rB   Nc                 $   |r|ni }|t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j	        j        t           j        j        j
        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        t           j        j        j        j        hv rt&          S || j        j        vr\|t           j        j        j        j        ur?| 5   |j        |i |}|t&          ur|cd d d            S 	 d d d            n# 1 swxY w Y    ||i |}| j                            |j        |||          S r   )r   opsatenis_contiguousdefaultmemory_formatis_strides_like_formatis_non_overlapping_and_denser   sym_sizestride
sym_stridestorage_offsetsym_storage_offsetnumel	sym_numeldimprimlayoutNotImplementedr[  r#   r   	decomposerR  _overloadpacket)r  functypesr(   r)   rr   s          r    __torch_dispatch__z#_FlopCounterMode.__torch_dispatch__  s   !)r EIN08IN0>IN9AIN?GIN'/IN+3IN)1IN-5IN19IN5=IN(0IN,4IN&.IN)13 3 3 "! t|111d%).BWB_6_6_  "DND3F33N**       *               dD#F##|(()=sD&QQQs   +GG!$G!)rB   N)rS  rT  rU  r   r  ru  rB   r"   r    rH  rH    sI            R R R R R Rr"   rH  )Fr   )NNNFN)Vr   torch.utils._pytreer   r   r   module_trackerr   typingr   r	   r
   r   r   collections.abcr   typing_extensionsr   collectionsr   torch.utils._python_dispatchr   mathr   	functoolsr   r  __all__r   r   r^  r_  r!   r#   r&  __annotations__r-   r   mmr  rI   addmmrN   bmmrS   baddbmmrV   
_scaled_mmr^   rr   rX  rh   convolution_convolutionro   convolution_backwardr{   r   '_scaled_dot_product_efficient_attention#_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   r   _flash_attention_forwardr   _efficient_attention_forwardr   r   0_scaled_dot_product_efficient_attention_backward,_scaled_dot_product_flash_attention_backward,_scaled_dot_product_cudnn_attention_backwardr   _flash_attention_backwardr   _efficient_attention_backwardr   r   r   r   r   r   r  r   rH  rB   r"   r    <module>r     s    F F F F F F F F F F ) ) ) ) ) ) : : : : : : : : : : : : : : $ $ $ $ $ $ ' ' ' ' ' ' # # # # # # : : : : : :             5
6WT]]Yt__y~  
 !#tCH~ " " "   XxB?O>PRZ[]_a[aRb>b5c    , tw/3   #      tz""% %# % % % #"% tx  
 
C 
 
 
 ! 
 t|$$& &C & & & %$& t'' % % 	% % % ('%( 	% %#Y%#Y% Cy% 	%
 	% % % %N ($*;<==bf O O Oux O O O >=O
 t011e e e e 21eN  $ D@@B C C EI @ @ @WZ @ @ @C C@	- 	- 	-" +` +` +` eE#s(OU38_eCHoxPUVY[^V^P_G``ab+` +` +` +`f -` -` -` eE#s(OU38_eCHoxPUVY[^V^P_G``ab-` -` -` -`` t4dCCC    	   DC> t8$GGG 	   HG>  6 MIIK L L ^b Y Y Yps Y Y YL LY t5tDDD 	   ED@ t94HHH 	   IH@GWJ
 	Hh 	L,	
 	O_ 	i 	y 	1 	0) 	,i 	,i 	9;M 	57I 	57I 	!#@  	%'H!" 	"$B#$ 	&(J% *   $##  # # #     
  L L L L L L L L^"R "R "R "R "R( "R "R "R "R "Rr"   