
    0h                     h    d Z ddlZddlZddlmc mZ d Zd Z	 	 	 	 d
dZ	ddZ
d Z G d d	      Zy)zUtils for running models in a distribution setting.

Mostly from
https://github.com/tensorflow/models/blob/master/official/utils/misc/distribution_utils.py.
    Nc                 \   t         j                  j                  j                  j                  t         j                  j                  j                  j
                  t         j                  j                  j                  j                  d}| |vrt        dj                  |             ||    S )aK  Return a CollectiveCommunication based on all_reduce_alg.

    Args:
      all_reduce_alg: a string specifying which collective communication to
        pick, or None.

    Returns:
      tf.distribute.experimental.CollectiveCommunication object

    Raises:
      ValueError: if `all_reduce_alg` not in [None, "ring", "nccl"]
    )NringncclzqWhen used with `multi_worker_mirrored`, valid values for all_reduce_alg are [`ring`, `nccl`].  Supplied value: {})	tf
distributeexperimentalCollectiveCommunicationAUTORINGNCCL
ValueErrorformat)all_reduce_alg collective_communication_optionss     c/var/www/html/engine/venv/lib/python3.12/site-packages/tf_keras/src/benchmarks/distribution_util.py_collective_communicationr      s     mm((@@EE**BBGG**BBGG($
 ==GGMvH
 	
 ,N;;    c                     | yt         j                  j                  t         j                  j                  d}| |vrt	        dj                  |             ||    } ||      S )a  Return a CrossDeviceOps based on all_reduce_alg and num_packs.

    Args:
      all_reduce_alg: a string specifying which cross device op to pick, or
        None.
      num_packs: an integer specifying number of packs for the cross device op.

    Returns:
      tf.distribute.CrossDeviceOps object or None.

    Raises:
      ValueError: if `all_reduce_alg` not in [None, "nccl",
        "hierarchical_copy"].
    N)r   hierarchical_copyzqWhen used with `mirrored`, valid values for all_reduce_alg are [`nccl`, `hierarchical_copy`].  Supplied value: {})	num_packs)r   r   NcclAllReduceHierarchicalCopyAllReducer   r   )r   r   mirrored_all_reduce_optionscross_device_ops_classs       r   _mirrored_cross_device_opsr   7   ss     ++]]DD# 88AAGB
 	
 9H!I66r   c                 z   |dk  rt        d      | j                         } | dk(  r |dkD  rt        dj                  |            y| dk(  r3t        j                  j
                  j                  t        |            S | d	k(  rS|dk(  rt        j                  j                  d
      S |dkD  rt        d      t        j                  j                  d      S | dk(  rO|dk(  rd
g}nt        |      D cg c]  }d|z  	 }}t        j                  j                  |t        ||            S t        d|        c c}w )aK  Return a DistributionStrategy for running the model.

    Args:
      distribution_strategy: a string specifying which distribution strategy to
        use. Accepted values are "off", "one_device", "mirrored", and
        "multi_worker_mirrored" -- case insensitive. "off" means not to use
        Distribution Strategy.
      num_gpus: Number of GPUs to run this model.

    Returns:
      tf.distribute.DistibutionStrategy object.
    Raises:
      ValueError: if `distribution_strategy` is "off" or "one_device" and
        `num_gpus` is larger than 1; or `num_gpus` is negative.
    r   z`num_gpus` can not be negative.off   zNWhen {} GPUs are specified, distribution_strategy flag cannot be set to `off`.Nmulti_worker_mirrored)communication
one_devicezdevice:CPU:0z=`OneDeviceStrategy` can not be used for more than one device.zdevice:GPU:0mirroredzdevice:GPU:%d)devicescross_device_opsz$Unrecognized Distribution Strategy: )r   lowerr   r   r   r   MultiWorkerMirroredStrategyr   OneDeviceStrategyrangeMirroredStrategyr   )distribution_strategynum_gpusr   r   r#   is         r   get_distribution_strategyr-   W   s^   * !|:;;1779%a<//5vh/?   77}}))EE3NC F 
 	
 ,q===22>BBa<O  }}..~>>
*q=%&G49(ODq*DGD}}--7	 . 
 	
 
./D.EF  Es   1D8c                    t        j                  t        j                  j	                  dd            }|r?t        |d   j	                  dg             t        |d   j	                  dg             z   }|S | rj| j                  d      }t        |      }|dkD  r|dk  rt        d	      |dk(  rdn|}t        j                  d|id|d
d      t        j                  d<   |S d}|S )zSet multi-worker cluster spec in TF_CONFIG environment variable.

    Args:
      worker_hosts: comma-separated list of worker ip:port pairs.

    Returns:
      Number of workers in the cluster.
    	TF_CONFIGz{}clusterchiefworker,r   r   z2Must specify task_index when number of workers > 1)typeindex)r0   task)	jsonloadsosenvirongetlensplitr   dumps)worker_hosts
task_index	tf_confignum_workersworkerss        r   configure_clusterrD      s     

2::>>+t<=I)I.227B?@3i $$Xr2D
 
& ! 
$$S)'l?zA~D  &*Q

"&**$g.!)J?#


;  r   c                 B    | r| j                         }|S t               }|S N)scopeDummyContextManager)strategystrategy_scopes     r   get_strategy_scoperK      s)    !)  -.r   c                       e Zd Zd Zd Zy)rH   c                      y rF    )selfs    r   	__enter__zDummyContextManager.__enter__       r   c                      y rF   rN   )rO   argss     r   __exit__zDummyContextManager.__exit__   rQ   r   N)__name__
__module____qualname__rP   rT   rN   r   r   rH   rH      s    r   rH   )r"   r   Nr   )N)__doc__r7   r9   tensorflow.compat.v2compatv2r   r   r   r-   rD   rK   rH   rN   r   r   <module>r]      sL     	 ! !<87B %	>BB r   