
    sg]              
          d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZmZ d dlmZ d dlmZmZ d dlmZ  G d	 d
      Z G d d      Z G d de      Z	 d Z dededee   ddfdZ!dee   ddfdZ"dee   ddfdZ#dee   ddfdZ$dee   deee%f   fdZ&dee   dee%ef   fdZ'dee   dee   deeeee   f   eee%f   ee   f   fdZ(dee   dee   fdZ)d Z* G d d      Z+y)     N)deque)DictListSet
NamedTupleTupleDeque)get_size_of_all_nodes)	PartitionDevicePartitionerConfig get_partition_to_latency_mapping get_latency_of_partitioned_graphNodeLatencyget_extra_size_ofPartitionMode)GraphModule)Nodemap_arg)split_modulec                   N    e Zd ZdZdedee   dee   dee   deddfd	Zdefd
Z	y)DAGNodezDAGNode class maintains useful information for a partition (submodule),
    and its input submodules and output submodules.
    submodule_nodeinput_nodesoutput_nodeslogical_device_ids
size_bytesreturnNc                 J    || _         || _        || _        || _        || _        y N)r   r   r   r   r   )selfr   r   r   r   r   s         `/var/www/html/venv/lib/python3.12/site-packages/torch/fx/experimental/accelerator_partitioner.py__init__zDAGNode.__init__   s+     %3'2(4-?$    c                 ,    t        | j                        S r    )strr   r!   s    r"   __str__zDAGNode.__str__*   s    4&&''r$   )
__name__
__module____qualname____doc__r   r   intr#   r&   r(    r$   r"   r   r      s^    %% $Z% 4j	%
 !I% % 
%( (r$   r   c                   J    e Zd ZdZddZdedee   dee   dee   d	eddfd
Zy)DAGz$DAG class contains all the DAG nodesr   Nc                     g | _         y r    )nodesr'   s    r"   r#   zDAG.__init__1   s	    $&
r$   r   r   r   logical_devicesr   c                 X    t        |||||      }| j                  j                  |       y r    )r   r2   append)r!   r   r   r   r3   r   nodes          r"   create_nodezDAG.create_node4   s-     K

 	

$r$   r   N)	r)   r*   r+   r,   r#   r   r   r-   r7   r.   r$   r"   r0   r0   .   sU    .'   $Z  4j	 
 c    
 r$   r0   c                   &    e Zd ZU dZeed<   eed<   y)PartitionResultz4NameTuple used for returning DAG and a new fx moduledagmodule_with_submodulesN)r)   r*   r+   r,   r0   __annotations__r   r.   r$   r"   r:   r:   B   s    >	H''r$   r:   c                      | D ]	  }g |_          y r    )r   )
partitions	partitions     r"   reset_partition_devicerA   L   s     *	')	$*r$   partition_0partition_1r?   r   c                    t        t        |            }| j                  j                  |j                        |_        |j	                          |j                  |       |j                  |        |j                  |       t        |       y)zGiven a list of partitions and its two partitions,
    combine these two partitions into a new one appending to the partitions
    and remove the previous two partitions from the list of partitions
    N)r   lenr2   unionrecalculate_mem_sizer5   removereorganize_partitions)rB   rC   r?   r@   s       r"   combine_two_partitionsrJ   Q   sq     #j/*I!''--k.?.?@IO""$i k"k"*%
r$   c                 f   | D ]   }t               |_        t               |_        " | D ]  }|j                  D ]v  }|j                  }|D ]c  }| D ]\  }||k7  s	||j                  v s||j                  vs'|j                  j                  |       |j                  j                  |       ^ e x  y)zHGiven a list of partitions, mark parents and children for each partitionN)setchildrenparentsr2   usersadd)r?   r@   r6   rO   nps         r"   set_parents_and_childrenrS   b   s       "	 U	E	"   1	OO 
	1DJJE 1 $ 1AI~!qww,4qww;N!**..q1		i01	1
	11 r$   c                 N    t        |       D ]  \  }}||_         t        |        y)zmGiven a list of partitions, reorganize partition id,
    its parents and its children for each partition
    N)	enumeratepartition_idrS   )r?   ir@   s      r"   rI   rI   z   s/    
 "*- #9!"	#Z(
r$   c                    t               }t               }| D ],  }t        |j                        dk(  s|j                  |       . t               }d}|ru|j	                         }||_        |j                  |       |j                  }|D ]  }||vs|j                  |        |s|j                         }t               }|dz  }|ruy)zJGiven a list of partitions,
    mark the bfs level for each partition
    r      N)rL   rE   rN   rP   pop	bfs_levelrM   copy)r?   current_levelvisitedr@   
next_levellevelrM   childs           r"   get_bfs_level_partitionrb      s     %(EM!eG )	y  !Q&i() "%JE
!%%'	#	I%% 	&EJ&u%	& &OO-MJQJE  r$   c                 X    i }| D ]"  }|j                   D ]  }|j                  ||<    $ |S )z;Given a list of partitions,return node to partition mapping)r2   rV   )r?   node_to_partitionr@   r6   s       r"   get_node_to_partition_mappingre      sC    )+ =	OO 	=D&/&<&<d#	== r$   devicesc                 6    i }| D ]  }|||j                   <    |S )z6Get a mapping from device logical ID to Device object.)
logical_id)rf   logical_id_to_deviceds      r"   get_logical_id_to_devicerk      s,    .0 /-.Q\\*/r$   c                 6   t        |      }i }i }|D ]  }g ||<   |j                  ||<    g }| D ]d  }|j                  g k7  rB|j                  D ]2  }||   }	||	   j                  |       ||	xx   |j                  z  cc<   4 T|j                  |       f |||fS )zGiven a list of partitions and a list of devices, returns:
    1. A mapping from device to partitions on it;
    2. A mapping from device to its remaining memory size;
    3. A list of partitions that do not have a device.
    )rk   available_mem_bytesr   r5   used_mem_bytes)
r?   rf   ri   device_to_partitionsdevice_to_left_mem_bytesrj   no_device_partitionsr@   rh   devices
             r"   get_device_partition_statsrs      s     4G<:<24 <"$Q&'&;&; #<  3	''2-':: M
-j9$V,33I>(0I4L4LL0M
 !''	23 	  r$   c           	         dt         dt        t            fddt         ffd}t        | |      \  }d}|D ]F  }t        t	        j                         t        j                  d                   ||      }|rE |S  |S )z\Given a list of partitions and a list of devices,
    map each partition into a device.
    r@   r?   c                    t               }|D ]  }|j                  |j                        } t        |      dk(  r| j                  S |j                  | j                        }d}| j                  D ]  }|t        ||      z  } |S )Nr   )rL   rF   r2   rE   rn   r   )r@   r?   	all_nodesrR   extra_size_neededr6   s         r"   $calculate_extra_mem_bytes_needed_forzNget_device_to_partitions_mapping.<locals>.calculate_extra_mem_bytes_needed_for   s      #u	 	1A!0I	1y>Q+++OOIOO4	OO 	DD!24!CC	D  r$   c                     D ]^  } | |         }||   k  s|   j                  |        | j                  j                  |j                         |xx   |z  cc<    y y)a3  Given a partition, find a logical device for the partition
        The algorithm is to put the partition on the device
        that has just enough mem left for that partition.
        device_to_left_mem_bytes is a dictionary between device and its left mem size
        sorted by its left mem size
        TF)r5   r   rh   )r@   rj   rw   rx   rp   ro   s      r"   find_device_forz9get_device_to_partitions_mapping.<locals>.find_device_for   s~     * 	A D/2! !#;A#>>$Q'..y9,,33ALLA(+/@@+	 r$   TrY   key)r   r   rs   dictsorteditemsoperator
itemgetter)	r?   rf   rz   rq   found_devicer@   rx   rp   ro   s	         @@@r"    get_device_to_partitions_mappingr      s    !!*.y/!9 , 	#:w7	  L) 	#'/G/M/M/OU]UhUhijUk(l#m &y1
 r$   c                     | h}t        | g      }|rR|j                         }|j                  D ]0  }|| k(  r y||vs|j                  |       |j	                  |       2 |rRy)z^Given a partition,check if there is a circular dependency on
    this partition using bfs
    TF)r   popleftrM   rP   r5   )r@   r^   queuerR   ra   s        r"   check_dependencyr     sm      )kG#YK0E
MMOZZ 	(E	!'KK&LL'	(  r$   c                       e Zd ZdZddZdedej                  j                  de	de
fdZ	 dd	eddfd
ZddZddZdefdZdedefdZdefdZd ZdeddfdZdedeeef   ddfdZdedeeef   ddfdZd Zy)Partitionera  A fx module may not fit into one device.
    Partitioner class helps partition one fx module into submodules (partitions),
    so that the submodules can be executed crossing different accelerators.
    The main function of this class is self.partition_graph.
    It partitions the fx module based on the scheme specified in partition_config
    A DAG structure is returned
    along with a new fx module with submodule nodes.
    r   Nc                 .    g | _         i | _        g | _        y r    )r?   rd   rf   r'   s    r"   r#   zPartitioner.__init__*  s    +-24%'r$   	fx_moduletorch_modulepartitioner_configc                    || _         || _        |j                  | _        t        | j                        dk(  rt	        d      t        | j                          | j                   j                  j                  }t        d |D              rt	        d      d}|D ],  }|j                  dk(  r n||j                  j                  z  }. t        | j                  d       }|j                  t        j                  k(  r(| j!                  |j"                  |j$                         na||j&                  k  r| j)                  ||j*                         n3|t-        d	 | j                  D              kD  rt	        d
      |j                  t        j.                  k(  rT| j                  d   j&                  t        fd| j                  D              st	        d      | j1                         n|j                  t        j2                  k(  r'| j5                  |j6                  |j8                         nT|j                  t        j:                  k(  r'| j=                  |j6                  |j8                         n| j?                          |j@                  r| jA                          | jC                         }| jE                  |      }	tG        |	|      }
|
S )zGiven the fx module, torch module and partitioner_config,
        find the partitions, do the partitions,
        and then return a DAG and a new fx module with submodule nodes (partitions)
        r   z
No devicesc              3   8   K   | ]  }|j                   d v   yw)>   outputget_attrplaceholderN)op).0r6   s     r"   	<genexpr>z.Partitioner.partition_graph.<locals>.<genexpr>B  s     RDtww??Rs   z.No Partition since no operations in the moduler   c                     | j                   S r    rm   rj   s    r"   <lambda>z-Partitioner.partition_graph.<locals>.<lambda>K  s    a>S>S r$   r{   )logical_device_idc              3   4   K   | ]  }|j                     y wr    r   )r   rj   s     r"   r   z.Partitioner.partition_graph.<locals>.<genexpr>W  s     &Sq'<'<&Ss   z,Devices have no enough memory for the modulec              3   <   K   | ]  }|j                   k(    y wr    r   )r   rr   rm   s     r"   r   z.Partitioner.partition_graph.<locals>.<genexpr>]  s%       ..2EEs   z'All devices must have same memory size!)$graph_moduler   rf   rE   RuntimeErrorr
   graphr2   allr   r   
total_sizemaxmoder   	aot_basedaot_based_partitionnode_to_partition_mapping#partition_to_logical_device_mappingrm   find_single_partitionrh   sum	sparse_nnsparse_nn_partition
cost_awarecost_aware_partitiontransfer_rate_bytes_per_secnode_to_latency_mappingkl_basedkl_based_partitionsize_based_partitionsaturate_hostdo_partitiondump_dagr:   )r!   r   r   r   r2   total_size_of_graphr6   device_with_max_memr<   r;   retrm   s              @r"   partition_graphzPartitioner.partition_graph/  sx    &()11t||!|,,d//0!!''--RERROPP 	>Dww("4??#=#==	>
 "$,,4ST""m&=&==$$"<<"FF
 !$7$K$KK&&#7J7U7U '  !3&Sdll&S#SSMNN "&&-*A*AA&*ll1o&I&I# "&,,  ''PQQ (()<=#((M,D,DD))&BB&>>
 $((M,B,BB''&BB&>>
 ))+ ++  "&!2!2!4 mm23c#9:
r$   r   c                    | j                         }| j                  j                  j                  D ]-  }|j                  dk(  r|j                  j                  |       / ||_        |g|_        t        | j                        | _
        y)z'Fit the whole fx module into one devicer   N)create_partitionr   r   r2   r   rP   rn   r   re   r?   rd   )r!   r   r   rB   r6   s        r"   r   z!Partitioner.find_single_partition  s     ++-%%++11 	(Dww(" !!$'	( &9"*;)<&!>t!Or$   c                 |   	 dt         f	 fd}i }g 	 j                         } j                  j                  j                  D ]  }|j
                  dv st         j                        t         j                        k  rJt        ||j                        }|j                  dk(  rN ||      }	j                  |       |j                  ||<   |j                  j                  |j                         n||   |k  rt         j                        t         j                        k(  r" j                  dd } j                  |        ||      } j                         }t        ||j                        }|j                  ||<   |j                  j                  |j                         |j!                  |       ||xx   |z  cc<    j                  |        t#         j                         t%         j                         _        t)         j                   j                        }|st+        d      y)a  This method is to partition the fx module based on memory size.
        It uses greedy approach. The result may not be the best.
        The basic idea is:
        Step 1:
        Find a device which has enough memory to fit the current node, create a empty partition
        with the size of that device.
        Then keep adding the following nodes into the partition until the partition is full.
        Step 2:
        Repeat Step 1 until no device left
        Step 3:
        If some nodes are left, create a partition for each left node (single node partition).
        and then try to map those partitions into logical devices with enough mem left.
        r   c                 
   t        | t                     }t        ddd      }j                  D ]  }|vs|j                  |k\  s|} n |j                  dk  rt        t        |       dz         j                  |       |S )ziGiven a node, this function is to find a logical device
            that could fit the node.
             r   zis too large to fit any device)r   rL   r   rf   rm   r   r&   r5   )r6   mem_size_neededrr   rj   occupied_devicesr!   s       r"   find_device_based_on_sizezCPartitioner.size_based_partition.<locals>.find_device_based_on_size  s     0ce<OBB'F\\ ----@F ))A-"3t9/O#OPP##F+Mr$   >   call_methodcall_modulecall_functionr   Nz6Cannot Get a Valid Partition to Logical Device Mapping)r   r   r   r   r2   r   rE   r?   rf   r   rn   r5   rm   r   rh   create_single_node_partitionadd_noderI   re   rd   r   r   )
r!   r   partition_to_left_mem_bytesr@   r6   total_size_of_input_nodesrr   non_single_node_partitions!found_partition_to_device_mappingr   s
   `        @r"   r   z Partitioner.size_based_partition  s   	v 	& =?#)+))+	%%++11 .	<DwwIIt'3t||+<<0A$	0X- //14!:4!@(//7 #66 4% "44;;F<M<MN
 8	B78  #4??3s4<<7HH >B__Q=O : $ A A$ G ( &?t%DF(,(=(=(?I8I $ioo95
 !' : : 8 ) &88??@Q@QR&&t,/	:>WW: 55d;].	<^ 	doo.!>t!O,LOOT\\-
) 1WXXr$   c                    t        | j                  | j                        \  }}}t        |      dk(  sJ dt        |              | j                  D cg c]  }t        ||         dkD  s| }}i }t        |      dz  t        |      z   t        | j                        k  rd}| j                  D cg c]  }||vr||vr| }}i }	|D ]f  }
|D cg c]#  }|j                  |
j                  ||
   z
  k\  r|% }}t        |      dk(  rd} n&t        |d       }|j                  |       |
|	|<   h |snB|j                  |	       t        |      dz  t        |      z   t        | j                        k  r|j                         D ]6  \  }}|j                  }||   D ]  }|j                  j                  |        8 | j                  D ]  }t        |j                          yc c}w c c}w c c}w )	a  Saturate host by assigning replicates to unused devices with enough memory.
        It uses a greedy approach to find a next available set of devices to place all split
        partitions: For each used device, it searches for an idle device with minimal memory
        size that can hold all the partition located on that device; If the search is successful
        for all used devices, it then assigns the new devices' logical ID to the corresponding
        partition.
        r   z2Expect no_device_partitions has 0 device, but get    TFc                     | j                   S r    r   r   s    r"   r   z+Partitioner.saturate_host.<locals>.<lambda>$  s    !BWBW r$   r{   N)rs   r?   rf   rE   rm   minrH   updater   rh   r   r5   print)r!   ro   rp   rq   rj   used_devices replicated_device_to_used_devicesuccessidle_devicestemp_replicate_mappingused_deviceavailable_devices
new_devicereplicate_deviceoriginal_devicerh   r@   rR   s                     r"   r   zPartitioner.saturate_host  s_    'tE		
 $  $%*	\?DX@Y?Z[	\* $(<<Ta37KA7N3ORS3STTAC(,!#c*J&KKsLLP
 
 G L(Q6V-V L  &("  , A *%,,"66.{;<< %! % ()Q.#G !28WX
##J/5@&z2A  ,334JKC ,!#c*J&KKsLLP
 
N .335	@ 
)44J1/B @	,,33J?@	@  	(A!&&'	(_ U%s   G(+G(3G-(G2c                 P     t         j                   j                   fd      }|S )z9Return a new fx module with submodule nodes (partitions).c                 "    j                   |    S r    )rd   )r6   r!   s    r"   r   z*Partitioner.do_partition.<locals>.<lambda><  s    //5 r$   )r   r   r   )r!   r<   s   ` r"   r   zPartitioner.do_partition7  s+    !-5"

 &%r$   r<   c                    t               }|j                  j                  D ]-  }|j                  dk(  r |S |j                  dv r%|j                  t
        j                  k(  rCi }t        |j                  |j                         t        |j                  |j                         t        |j                        dkD  rt        |j                        }n|g}t        |j                  j!                  dd      d         }| j"                  |   j$                  }| j"                  |   j&                  }|j)                  |t        |      |||       0 |S )z?Return the dag structure and the new fx module with submodules.r   >   r   r   rY   _r   )r0   r   r2   r   targetr   __getitem__r   args
setdefaultkwargsrE   rO   listr-   namersplitr?   r   rn   r7   )	r!   r<   r;   r6   r   r   rV   
device_idsr   s	            r"   r   zPartitioner.dump_dag@  s    e*0066 	Dww(", 
+ ww55{{h222,.KDII{556DKK!7!78
 4::"#DJJ/ $vtyy//Q7;<L6IIJ6EEJOOd;'z:+	0 
r$   c                 |    t        | j                        }t        |      }| j                  j                  |       |S )z4Create a partition and append it to self.partitions.)rE   r?   r   r5   )r!   rV   r@   s      r"   r   zPartitioner.create_partition]  s2    4??+l+	y)r$   c                 F    | j                         }|j                  |       y)z$Create a partition for a single nodeN)r   r   )r!   r6   r@   s      r"   r   z(Partitioner.create_single_node_partitiond  s!    ))+	4 r$   rm   c                     dt         t           dt        ddf fd}d dt         t           dt        dt         t           dt        t        t         t           f   f fdd fd		}d
t
        dt        f fd}g g d j                         } j                  j                  j                  D ]  }|j                  dv s ||      k7  r|j                  dk7  r ||      } t        ||j                        }||j                  z   kD  r; ||      }t        ||j                        }|kD  rt        |j                  dz         |j                  |         ||d       t!         j"                          |        |       d}D ]  }||j                  z  } t%              t%         j&                        kD  rGdt)        t%                    z   dz   t)        t%         j&                              z   dz   }	t        |	      g }
t+              D ]  \  }}||j                  z   kD  r$t        dt)        |j,                        z   dz          j&                  |   j.                  g|_        |
j3                   j&                  |   j.                          D ]	  }|
|_         t5         j"                         _        y)a7  This method partition a sparse nn module.
        It is size based partition but different from size_based_partition,
        it only works when all the devices have same memory size (available_mem_bytes).
        In the future, devices with different mem sizes will be supported like size_based_partition.
        It first traverse all the nodes and do the partitions based on the same memory size.
        If the current partition has no enough memory left for a new op node
        (call_module, call_method, call_function), a new partition is created.
        When crossing the boundary between non-embedding nodes and embedding nodes,
        a new partition is created regardlessly.
        For example, if the current node is a non-embedding node but the next node is an
        embedding node, a new partition is created for the next node.
        After the partition, the partitions are combined as much as possible.
        The rule is that a non-embedding partition only
        combines with another non-embedding one.
        So as the embedding partitions.
        r?   rm   r   Nc                 t    d}|r3t        | d       }t        j                          |||       \  }} |r3y)a  Combining small partitions together to keep as less partitions as possible.
            Here is an example of the algorithm to do this:
            Assume some partitions, we first sort them based on partition used memory size.
            [(partition_4, 1), (partition_3, 1), (partition_2, 2), (partition_1, 7), (partition_0, 9)]
            The available memory is 10.
            step 1: self.find_partition_to_combine_based_on_size()
            First, mark bfs level for each partition
            Second, look the smallest partition, partition_4: 10 - 1 = 9
            It means any partition has a used memory equal or less than 9 could combine this partition
            We go from the largest and selection partition_0.
            Check the bfs level for two partitions, if the level difference is less than 2,
            it can be combined.
            step 2: repeat step 1 until no partitions can be combined
            Tc                     | j                   S r    )rn   )rR   s    r"   r   z[Partitioner.sparse_nn_partition.<locals>.combine_partitions_based_on_size.<locals>.<lambda>  s    QEUEU r$   r{   N)r~   rb   r?   )r?   rm   find_combinationsorted_partitions'find_partition_to_combine_based_on_sizer!   s       r"    combine_partitions_based_on_sizezIPartitioner.sparse_nn_partition.<locals>.combine_partitions_based_on_size|  sJ    "  $"$*:;U$V!'8/V%':J0, * # r$   c                     | j                   j                  |j                         }d}|D ]  }|t        ||      z  } |S )zuGiven two partitions, calculate how many mem bytes
            are needed if two partitions are combined
            r   )r2   rF   r   )p1p2r2   mem_bytes_neededr6   s        r"   calculate_mem_bytes_neededzCPartitioner.sparse_nn_partition.<locals>.calculate_mem_bytes_needed  sJ     HHNN288,E  C $5dE$BB C##r$   r   c                 p   d}| j                  d      }| ddd   D ]  }t        |j                  |j                  z
        dk  s) ||      }||k  s8t        ||j                         |j                  |       |j                  |       |j                  j                  d          d} ||fS  ||fS )z+step 1 in combine_partition_based_on_size()Fr   Nr   rY   T)rZ   absr[   rJ   r?   rH   r5   )	r   rm   r?   r   smallest_partitionrR   r   r   r!   s	          r"   r   zPPartitioner.sparse_nn_partition.<locals>.find_partition_to_combine_based_on_size  s      %!2!6!6q!9&tt, 
)33akkABaG'A!EW'X$'+>>.q2DdooV"))*<="))!,"))$//"*=>+/(#Z//
 $Z//r$   c                     rj                  |        nj                  |        |rj                         } | _        | S y)zyIf crossing the boundary between non-embedding nodes and
            embedding nodes, create a new partition
            N)r5   r   left_mem_bytes)r@   new_partitionrm   embedding_partitionsin_embedding_regionnon_embedding_partitionsr!   s     r"   reset_partition_in_sparse_nnzEPartitioner.sparse_nn_partition.<locals>.reset_partition_in_sparse_nn  sF     #$++I6(//	: 113	+>	(  r$   r6   c                     | j                   dk(  rmj                  }t        | j                        j	                  d      D ]:  }t        ||      st        d| d|       t        ||      }dt        |      v s: y y)z$Check if a node is an embedding noder   .zModule z has no attribute 	EmbeddingTF)r   r   r&   r   splithasattrr   getattr)r6   	submoduleatomr!   s      r"   is_embedding_nodez:Partitioner.sparse_nn_partition.<locals>.is_embedding_node  s    ww-' --	,2237 $D"9d3*%i[0B4&I  !(	4 8I"c)n4#$ r$   F>   r   r   r   r   z!is too large to fit into a device)r   zNeed z devices, but only z	 provided
partition_zN(embedding partition) and non embedding partitions can not fit into one device)T)r   r   r-   r   boolr   r   r   r   r2   r   rn   r   r   r   r   rS   r?   rE   rf   r&   rU   rV   rh   r   r5   re   rd   )r!   rm   r   r   r	  r@   r6   r   &total_size_of_non_embedding_partitionsmsgr   rW   r   r   r   r   r   s   ``          @@@@@r"   r   zPartitioner.sparse_nn_partitionj  s   $	Y	>A		8	$	0#I	0!$	0 Y	0 4i()		0*	 		D 	T 	 1346 $)))+	%%++11 	)DwwII$T*.AA !//14$@$K	.A*A',=dIOO,T)-	0H0HH)* !=Y GI0A$	0X-03FF* KK*MM  ""4(+	), 	%YeD 1()ACVW()=?RS12.1 	OI2i6N6NN2	O #$s4<<'88c./01'( c$,,'() 	  s##%&:; 	DLAy 79Q9QQ%& # )0012fg  15Q0J0J/K	, ''Q(B(BC	D  2 	<I+;I(	< "?t!Or$   r   r   c                     dt         f fddt        f fd} j                  j                  j                  D ]"  }|j
                  dvs j                  |       $ t         j                         t         j                         d}|r |      }|rt         j                         t         j                         _        y)aG  This method is to partition the fx module based on the cost.
        The cost is the total latency of running the whole fx module.
        In partitioner_utils.py, the cost model is built.
        The cost aware partition algorithm is:
        #1. At every beginning, each node is a partition.
            Then we map all the partitions to the devices
            and calculate the cost
        #2. Then try to pre-combine any two of the partitions if the two
            partitions can be combined.
            (the bfs level is less than 2 or two partitions are connected and
            can find partition to device mapping)
            See if any partition pair could reduce the current cost.
            Choose the pair that shows the minimum cost and then combine them
        #3. Repeat #2 until the cost cannot be reduced.
        r   c                    ||    }||   }	 t        |j                  |j                  z
        dk  s||j                  v s||j                  v rot	        |||       t        |d         rt        d      S t        |       t        |	j                        }|st        d      S t        |      }t        ||
      }|S t        d      S )zGiven two partitions and a list of partitions, combine these two partitions
            and see what is the cost of the modified partition list
            rY   r   inf)r   r[   rN   rM   rJ   r   floatrA   r   rf   r   r   )p0_indexp1_indexr?   p0r   found_deivcepartition_to_latency_mappingcostr   r!   r   s           r"   try_combining_partitionszBPartitioner.cost_aware_partition.<locals>.try_combining_partitions/  s     H%BH%B R\\BLL01Q6"**$"++&&r2z:#JrN3 <'&z2?  $ <'/O 70, 80/
 <r$   c           	         t        
j                  |      }t        
j                  ||       }t        
j                        dk(  ryg }t	        t        
j                        dz
        D ]`  }t	        |dz   t        
j                              D ]9  } ||
j                  dd       }||k  r||g}|}t        
j                         ; b t        |      dk7  r;
j                  |d      }
j                  |d      }	t        ||	
j                         t        
j                         t        
j                         t        
j                  
j                         t        |      dk7  S )a  Given transfer rate between partitions and each node's latency,
            find two partitions to combine so the cost of the partitions can
            be reduced.
            The algorithm is :
            1. Go through all the partition pairs and see
            if any pair of partitions can be combined.
            2. Calculate the cost after the combination.
            3. Select the minimum cost and combine its corresponding partition pair.
            rY   FNr   )r   r?   r   rE   rangerI   rJ   rb   rA   r   rf   )r   r   r  r  partition_pairrW   jnew_costr  r   r!   r  s             r"   search_combinationz<Partitioner.cost_aware_partition.<locals>.search_combinationU  sN    ,L!8,( 4,+D
 4??#q((*N3t/!34 ;q1uc$//&:; ;A  81dooa>PQH4'*+Q')$//:;; >"a'__^A%67__^A%67&r2t?#DOO4"4??3,T__dllK~&!++r$   >   r   r   r   TN)r  r  r   r   r2   r   r   rS   r?   rb   rI   re   rd   )r!   r   r   r  r6   r   r  s   ```   @r"   r   z Partitioner.cost_aware_partition  s    *$	  $	 L(	,(	,T %%++11 	8DwwCC11$7	8 	!10  2+-D   	doo.!>t!Or$   c           	      B    d  fdfd} j                          t         j                  |      }t         j                  |      }g }g }g } j                  j
                  j                  D ]"  }	|	j                  dvs|j                  |	       $ |D ]  }
 j                  |
   } j                  |   }t         j                        D ]7  \  }}||k7  s j                  |   } ||
|||      \  }}||k  s0|}|}||g}9 t        |      dk7  s |d   |d   |d   |d          t         j                         t         j                   j                          t         j                         t         j                   j                         y)a  This function is a cost aware partition based
        on Kernighan-Lin algorithm.
        First, the graph is partitioned using size_based_partition.
        Then, each node is swapped with any other node in a different
        partition, and at the same time, the cost is estimated after
        the swapping.
        For example, we have nodes n0, n1, n2, n3 and n4.
        Using size_based_partition, n0 and n1 are in Partition p0.
        n2, n3 and n4 in Partition p1. The current cost is estimated.
        We first tried using n0 to swap with n2 from the other partition.
        Then we see that swapping n0 and n2 shows a lower cost
        than the current cost and it is the minimum among other pairs like
        (n0, None)(This means moving n0 to Partition without swapping other nodes),
        (n0, n3) and (n0, n4). We swap n0 and n2 and set the new cost
        as the current cost.
        Then We repeat this process for all the other nodes until all swapping pairs
        are tried.
        c                     | "|j                  |        |j                  |        |#|j                  |       |j                  |       y y r    )remove_noder   )n0n1r  r   s       r"   
swap_nodesz2Partitioner.kl_based_partition.<locals>.swap_nodes  sA     ~r"B~Br" r$   c                    t        d      } 
| |||       t        	j                         t        |      s{t        |      spt	        	j                         t        	j                  |      }t        	j                  	j                        }|st        d      }nt        	j                  |      } 
|| ||       t        	j                         t	        	j                         t        	j                  	j                         |S )Nr  )	r  rI   r?   r   rA   r   r   rf   r   )r"  r#  r  r   r   transfer_rate_per_secr  r  r   r!   r$  r   s            r"   try_swap_nodesz6Partitioner.kl_based_partition.<locals>.try_swap_nodes  s     <Dr2r2&!$//2$R(3CB3G&t7/OOO%<0,  @OOT\\  $ <D;43D r2r2&!$//2"4??3,T__dllKKr$   c           	          t        |j                        dgz   }t        d      }g }|D ],  }||j                  dv r 
| |||||      }	|	|k  s'| |g}|	}. 	|fS )zzThis function helps to swap one node from partition p0
            with all the nodes in another partition p1
            Nr  >   r   r   )r   r2   r  r   )r6   r  r   r   r&  p1_nodesmin_cost	node_pairr#  r  r'  s             r"   swap_node_to_partitionz>Partitioner.kl_based_partition.<locals>.swap_node_to_partition  s     BHH~.HU|H$&I 
$>bee/J&J%"b"&=?T (?!%r
I#H
$ ?"r$   >   r   r   r   r   rY   N)r   r   r?   r   r   r   r2   r   r5   rd   rU   rE   rI   r   rf   )r!   r   r   r,  r  r  r+  r  op_nodesrQ   r6   r  r  r  r   r   r  new_node_pairr$  r'  s   ``                @@r"   r   zPartitioner.kl_based_partition  s   0		#	@	#. 	!!#'GOO4(
$ 0OO9;V
 !#	*,""((.. 	#Att@@"	#  	PD--d3H*B  )9 2!x'2B.D/3/+Hm  $'$1	*,b2" 9~"aL)A,q0A>RSCT &doo60$,,O9	P: 	doo.($,,Gr$   c                    i }|| _         | j                   D ]n  }| j                   |   }||vr6t        |      }| j                  j                  |       |||<   ||   |_        n|| j                   |      }|j                  |       p y)zqThis function helps to rebuild the partitions given the nodes and its
        corresponding partition id
        N)rd   r   r?   r5   r   r   )r!   r   r   !partition_id_to_partition_mappingr6   rV   r@   s          r"   r   zPartitioner.aot_based_partition   s     CE)!:** 	%D11$7L#DD%l3	&&y1BK1,?/R 0	, >**40	 t$	%r$   r8   )r   )r)   r*   r+   r,   r#   r   torchnnModuler   r:   r   r-   r   r   r   r   r0   r   r   r   r   r   r  r   r   r   r   r   r   r.   r$   r"   r   r      s   (
PP hhooP .	P
 
Pf =>69	"^@C(J&k &{ s :) ns nt n`w%*w "&dK&7!8w 
	wrK%*K "&dK&7!8K 
	KZ%r$   r   ),r   collectionsr   typingr   r   r   r   r   r	   r1  "torch.fx.passes.graph_manipulationr
   'torch.fx.experimental.partitioner_utilsr   r   r   r   r   r   r   r   torch.fx.graph_moduler   torch.fx.noder   r   torch.fx.passes.split_moduler   r   r0   r:   rA   rJ   rS   rI   rb   r-   re   rk   rs   r   r   r   r.   r$   r"   <module>r;     s     < <  D	 	 	 . ' 5( (.   ((j ( F*
)2@DY	"i T 0d9o $ Y D 8d9o $tSy/  d6l  tCK7H  "Y"*.v,"
4Y'($vs{*;T)_LM"J4Y4*.v,4n$W% W%r$   