
    }Yha3                        d dl Z d dlmZ d dlmZmZmZ d dlZd dlm	Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZmZmZmZmZm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z,m-Z-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 e7e8e9eee:                  ee:         f         f         Z;dgZ<d+de:de8de8fdZ=	 d,dee
j>                 defdZ?dej@        deAfdZB	 d+ded ee:         de8dej@        fd!ZCd"ede9e;ee
j>                 f         fd#ZD G d$ d%e          ZE	 d,d&ed'e8d(e*d)ee#         def
d*ZFdS )-    N)Sequence)castOptionalUnion)_get_device_module)ShardedTensor)TensorProperties)Shard)ChunkShardingSpec)unflatten_state_dict)DefaultLoadPlanner)BytesStorageMetadataChunkStorageMetadataMetadataMetadataIndexSTATE_DICT_TYPEr	   TensorStorageMetadata)LoadPlanLoadPlanner)_create_read_items create_read_items_for_chunk_list)load_state_dict)StorageReader)_element_wise_add_element_wise_sub_normalize_device_info)_get_default_group)_create_chunk_sharded_tensor)_remote_device)DTensor!load_sharded_optimizer_state_dictcudaglobal_rankdevice_typereturnc                     |dk    rdS t          |          }|                                r%t          || |                                z            S dS )Ncpu)r   is_availabler   device_count)r#   r$   device_modules      x/var/www/tools.fuzzalab.pt/emblema-extractor/venv/lib/python3.11/site-packages/torch/distributed/checkpoint/optimizer.py_gen_rank_devicer,   6   sb    eu&{33M!!## 
%}'A'A'C'CC
 
 	
 5    pgc           	          t           j                                       j         -fdt	          t          j                              D             }n. fdt	                                                     D             }t          dt          t          t          t          t          f                  |                    S )Nc           	      <    g | ]}d | dt          |           S rank:/)r,   ).0idxpg_device_types     r+   
<listcomp>z(_create_colwise_spec.<locals>.<listcomp>F   sE     
 
 
 BCAA*3??AA
 
 
r-   c                 b    g | ]+}d | dt          t          j        |                     ,S r1   )r,   distget_global_rank)r4   r5   r.   r6   s     r+   r7   z(_create_colwise_spec.<locals>.<listcomp>K   sR     
 
 
 \C[[*4+?C+H+H.YY[[
 
 
r-   r   dim
placements)r9   distributed_c10d_get_pg_default_devicetyperangeget_world_sizesizer   r   listr   r   str)r.   r=   r6   s   ` @r+   _create_colwise_specrF   A   s     *AA"EEJN	z
 
 
 
T02233
 
 




 
 
 
 
RWWYY''
 
 

 U>3#678*EE   r-   valc                 &   t          |           t          u rt          |                                           dk    rdS t          |                                 d         j                  t          u rdS t          |                                 d         j                  t
          u rt          d          n[t          |           t
          u rEt          | j                  t
          u st          | j                  t          u rt          d          dS )Nr   FTz1Cannot handle DTensor nested inside ShardedTensorzCannot handle nested DTensor)r@   r   lenlocal_shardstensorr    
ValueError_local_tensor)rG   s    r+   _is_nested_tensorrN   U   s    CyyM!!s!!""a''5  ""1%,-->>4  ""1%,--88PQQQ 9	cg		S7**d33D.E.E.V.V78885r-   propsrC   c                 F   |dk    r:t          t          j        t          |                                                    }n4t          j        |t          |                                                    }t          j        || j        | j        | j        | j	        |          S )Nr'   )rC   dtypelayoutrequires_grad
pin_memorydevice)
r   torchrU   r   current_deviceemptyrQ   rR   rS   rT   )rO   rC   r$   rU   s       r+   _alloc_tensorrY   d   s     eel$6{$C$C$R$R$T$TUU+K88GGII
 
 ;k|)#   r-   
state_dictc                    i }d}|                                  D ]\  }}d|                                f||<   t          |          rt          |                                          dk    s
J d            t          |t                    s
J d            |                                d         }|j        j        |j        j	        f||<   |j
        j        }||fS )a+  
    Load the right TP slice of the optimizer state.

    This is not easy since the per-tensor slicing can't be inferred from checkpoint metadata.
    We take advantage of the model state_dict producing a sliced ST to figure out what we need to load.
    This is pretty fragile and it might be easier for FSDP to compute this info for us.
    Returns a dictionary where keys are the same of the state_dict and the value is a tuple of
    (offset, size) for the current rank TP slice.
    N.B. The state_dict *MUST* come from FSDP.sharded_state_dict.
    N   z%Cannot handle ST with multiple shardsz$Can only handle nested ShardedTensorr   )itemsrC   rN   rI   rJ   
isinstancer   metadatashard_offsetsshard_sizesrK   _process_group)rZ   specsdp_pgkeyvalueshards         r+   _get_state_dict_2d_layoutrh   x   s     #%E)-E &&(( 0 0
UEJJLL)c
U## 	0u))++,,1117 211 e]33  6   &&((+E,*E#J L/E 	 r-   c                        e Zd ZU eeef         ed<   eed<   eed<   deee	e
         f         ddf fdZdefdZd	edej        f fd
Z xZS )_ReaderWithOffsettranslationrZ   r_   fqn_to_offsetr%   Nc                     t                                                       || _        t          i           | _        i | _        i | _        d S N)super__init__rl   r   r_   rZ   rk   )selfrl   	__class__s     r+   rp   z_ReaderWithOffset.__init__   sC    * r-   c           	         g }i | _         | j                                        D ]\  }}| j        j        |         }t          |t                    s|t          |||          z  }B|| j        vr|t          |||          z  }`| j        |         }t          |
                                          dk    sJ |
                                d         }t          t          j        t          |j        j        |                    t          j        |j        j                            g}t#          |t%          t&          |          |          }|D ]f}	|	j        j        J t-          |	j        j        |          }
t/          j        |	j        t          j        |
                    }|| j         |	j        <   g||z  }t3          |          S )Nr\   r   )offsetssizes)offset)rk   rZ   r]   r_   state_dict_metadatar^   r   r   rl   rI   rJ   r   rV   Sizer   r`   ra   r   r   r   
dest_indexrv   r   dataclassesreplacer   )rq   requestsfqnobjmdrv   original_shardlocal_chunksreqsrioriginal_offsetoriginal_indexs               r+   create_local_planz#_ReaderWithOffset.create_local_plan   s   --// $	 $	HC237Bc=11 .sB<<<$,,,.sB<<<',Fs''))**a//// --//2N$!J).*A*OQWXX   *^%<%HII	  L 4T/44l D
  A A}+777"3BM4H&"Q"Q!,!4M%*_*E*E" " " 3A //HH!!!r-   indexc                 x    t                                          | j                            ||                    S rn   )ro   lookup_tensorrk   get)rq   r   rr   s     r+   r   z_ReaderWithOffset.lookup_tensor   s.    ww$$T%5%9%9%%G%GHHHr-   )__name__
__module____qualname__dictr   __annotations__r   r   rE   r   intrp   r   r   rV   Tensorr   __classcell__)rr   s   @r+   rj   rj      s         m]23333d3+=&> 4      ("8 (" (" (" ("TI= IU\ I I I I I I I I I Ir-   rj   model_state_dictoptimizer_keystorage_readerplannerc                 R   |                                 }t          |           \  }}t          j                            |          j        }t          |          }|wg }	t          t          j                              D ]B}
t          ||
|
                                z            }|	                    d|
 d|            Ct          d|	          }nt          |          }i }i }|j                                        D ]n\  }}|j        |         }|d         |k    r t#          |t$                    rd||<   ;|j                                        dk    rt+          |j        |j        |          ||<   w|qt/          t+          |j        |j        |          t          j                    t          j                    |
                                t3                                ||<   |d	         }|                    |d|j        f          d         }t7          |j        j        |j        j        |j        j        |j        j        |j        j         
          }|!                    tE          j#        |          |          }g }t          j        |          }|j$        D ]p}tK          tL          |j'                  (                                |k    r3|                    tS          t+          |j        |j*        |          |                     qtW          j,        |||          }||v r=||         d         /tK          tZ          t\                   ||         d                   ||<   |||<   pt_          |||ta          |          n|           tc          ||j                  }|S )a  
    Load a state_dict in conjunction with FSDP sharded optimizer state.

    This is the current recommended way to checkpoint FSDP.
    >>> # xdoctest: +SKIP
    >>> import torch.distributed.checkpoint as dist_cp
    >>> # Save
    >>> model: torch.nn.Model
    >>> optim_params = model.parameters()
    >>> optim = torch.optim.SGD(optim_params, lr=0.01)
    >>> # Save
    >>> with FSDP.state_dict_type(model, StateDictType.SHARDED_STATE_DICT):
    >>>     state_dict = {
    >>>         "optimizer": FSDP.optim_state_dict(model, optim),
    >>>         "model": model.state_dict()
    >>>     }
    >>>     dist_cp.save_state_dict(
    >>>         state_dict=optim_state,
    >>>         storage_writer=dist_cp.FileSystemWriter("checkpoint"),
    >>>         planner=dist_cp.DefaultSavePlanner(),
    >>>     )
    >>>
    >>> # Load
    >>> with FSDP.state_dict_type(model_tp, StateDictType.SHARDED_STATE_DICT):
    >>>     model_state_dict = model_tp.state_dict()
    >>>     checkpoint = {
    >>>         "model": model_state_dict
    >>>     }
    >>>     dist_cp.load_state_dict(
    >>>         state_dict=checkpoint,
    >>>         storage_reader=dist_cp.FileSystemReader(checkpoint_file),
    >>>         planner=dist_cp.DefaultLoadPlanner(),
    >>>     )
    >>>     model.load_state_dict(checkpoint["model_state"])
    >>>
    >>>     optim_state = dist_cp.load_sharded_optimizer_state_dict(
    >>>         model_state_dict,
    >>>         optimizer_key="optimizer",
    >>>         storage_reader=dist_cp.FileSystemReader("checkpoint"),
    >>>     )
    >>>
    >>>     flattened_osd = FSDP.optim_state_dict_to_load(
    >>>        model, optim, optim_state["optimizer"]
    >>>     )
    >>>
    >>>     optim.load_state_dict(flattened_osd)
    Nr2   r3   r   r;   z
<bytes_io>r\   )rank
world_sizenum_devices_per_noder.      )rQ   rR   rS   memory_formatrT   )rK   r_   )process_group)rZ   r   r   )2read_metadatarh   r9   r>   r?   r@   r   rA   rB   r   r)   appendr   rF   rw   r]   planner_datar^   r   rC   numelrY   
propertiesr   get_rankr   r   ShardTensorPropertiesrQ   rR   rS   r   rT   build_metadatarV   rx   shards_metadatar   r   	placementr   r
   ra   r   +_init_from_local_shards_and_global_metadatar   r   r   rj   r   )r   r   r   r   r_   layout_specsrd   dp_pg_device_typer*   r=   idevice_infosharding_specrZ   rl   re   rf   key_pathspec_key
alloc_sizer   st_mdrJ   current_rankshard_mdsts                             r+   r!   r!      s   j ++--H34DEEL%-DDUKKP&'899M}
t*,,-- 	9 	9A0!1}'A'A'C'C#C K 7a77+778888)aJGGG,U33 #%J.0M288:: 8! 8!
U(-A;-''e122 	*JsO :""+ %*.? JsOO ]:e.
<MNN]__.00%2%?%?%A%A%''  JsOO  {H%))(T5:4FGGJJ.&,'.#.<#.< +6  J "00J1G1GTTEL=//L!1 
 
(:;;@@BBlRR##,!,h.BDU    "*	      Je5  B <''L,B1,E,Q%)(3-h9OPQ9R%S%Sc" JsOO %494E!-0007	    &j(2GHHJr-   )r"   rn   )Grz   collections.abcr   typingr   r   r   rV   torch.distributeddistributedr9   torch._utilsr   +torch.distributed._shard.sharded_tensor.apir   0torch.distributed._shard.sharded_tensor.metadatar	   r   -torch.distributed._shard.sharded_tensor.shardr
   :torch.distributed._shard.sharding_spec.chunk_sharding_specr   )torch.distributed.checkpoint._nested_dictr   ,torch.distributed.checkpoint.default_plannerr   %torch.distributed.checkpoint.metadatar   r   r   r   r   r   $torch.distributed.checkpoint.plannerr   r   ,torch.distributed.checkpoint.planner_helpersr   r   .torch.distributed.checkpoint.state_dict_loaderr   $torch.distributed.checkpoint.storager   "torch.distributed.checkpoint.utilsr   r   r   "torch.distributed.distributed_c10dr   #torch.distributed.fsdp._shard_utilsr   torch.distributed.remote_devicer   torch.distributed.tensorr    r   rE   tupler   STATE_DICT_2D_LAYOUT__all__r,   ProcessGrouprF   r   boolrN   rY   rh   rj   r!    r-   r+   <module>r      s       $ $ $ $ $ $ ( ( ( ( ( ( ( ( ( (              + + + + + + E E E E E E      @ ? ? ? ? ? X X X X X X J J J J J J K K K K K K                  G F F F F F F F        K J J J J J > > > > > >         
 B A A A A A L L L L L L : : : : : : , , , , , , Cx'>'M!NNO 
 (
 # C S     '+ "#   (5< D      FL #+C=?B
\   (""
$*;!<<=" " " "J7I 7I 7I 7I 7I* 7I 7I 7I| &*	N N%NN "N k"	N
 N N N N N Nr-   