diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt index 0483a5737b830..3d85b681c90d1 100644 --- a/offload/utils/gpurun/CMakeLists.txt +++ b/offload/utils/gpurun/CMakeLists.txt @@ -1 +1,2 @@ add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun) +add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun-old) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun index 870bc7a8ccbcd..679269ce5c246 100755 --- a/offload/utils/gpurun/gpurun +++ b/offload/utils/gpurun/gpurun @@ -1,697 +1,374 @@ -#!/bin/bash -# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. +#!/usr/bin/env python3 # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. +# Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved. # -# gpurun: Process launch utility for GPU applications. This is a wrapper -# to execute application binaries including OpenMPI GPU applications. -# See help message below (gpurun -h) for more information. +# gpurun: Application process launch utility for GPUs. +# This utility ensures the process will enable either a single +# GPU or the number specified with -md (multi-device) option. +# It launches the application binary with either the 'taskset' +# or 'numactl' utility so the process only runs on CPU cores +# in the same NUMA domain as the selected GPUs. # -# Usage Examples: -# gpurun true -# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# This utility sets environment variable ROCR_VISIBLE_DEVICES +# to selected GPUs ONLY if it was not already set by the +# callers environment AND the number of GPUs is not 1. # - -# If set to 1, just invoke the rest of the command line without doing anything -# else. -GPURUN_BYPASS=${GPURUN_BYPASS:-0} - -function execOnError() { - exec "$@" -} - -# PROGVERSION string is updated by cmake when component is installed -PROGVERSION=X.Y-Z -function version(){ - echo $0 version $PROGVERSION - exit 0 -} -function usage(){ -/bin/cat 2>&1 <<"EOF" - - gpurun: Application process launch utility for GPUs. - This utility ensures the process will enable either a single - GPU or the number specified with -md (multi-device) option. - It launches the application binary with either the 'taskset' - or 'numactl' utility so the process only runs on CPU cores - in the same NUMA domain as the selected GPUs. - This utility sets environment variable ROCR_VISIBLE_DEVICES - to selected GPUs ONLY if it was not already set by the - callers environment AND the number of GPUs is not 1. 
- This utility also sets environment variable HSA_CU_MASK - to control which CUs are available to the process. - HSA_CU_MASK is set only when more than one OpenMPI process - (rank) will utilize the same GPU and it is not preset. - Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the - number of CUs available to the process after masking. - - Usage: - gpurun [ ] - mpirun -np gpurun [ ] - - Options: - -h Print this help message and exit - -md Set number of desired devices for multi-device mode, default=1 - -s suppress output, often useful in benchmarking - -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 - -v Verbose output, same as GPURUN_VERBOSE=1 - -vv Verbose output, same as GPURUN_VERBOSE=2 - -m use numactl membind to CPUs in same NUMA domain. Note: Allocation - fails when not enough memory available on these nodes. - -l use numactl localalloc to CPUs in same NUMA domain. Note: If - memory cannot be allocated, alloc falls back to other nodes. - -nr use numactl ROCR_VISIBLE_DEVICES - -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK - --version Print version of gpurun and exit - - Optional Input environment variables: - GPURUN_VERBOSE - 0: default for silent operation, no trace printed to stderr - 1: -v prints trace record including process launch cmd to stderr - 2: -vv prints trace and other summary diagnostics - ROCMINFO_BINARY Set location of rocminfo binary - AOMP: location of AOMP or ROCM - GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. - This only works for single device mode. - GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards - GPURUN_MASK_POLICY : useful if machine has different GPU cards - ROCR_VISIBLE_DEVICES: See description above - OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi - OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi - This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID - and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK - - Generated (output) Environment Variables: - OMPX_TARGET_TEAM_PROCS - Number of CUs available to process - ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset - HSA_CU_MASK - The CU mask for the device. - LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument - GPU_MAX_HW_QUEUES - LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" - - Limitations: - - Currently, gpurun creates masks that are mutually exclusive of each other. - That is, the MPI processes will not share CUs. If number of ranks is not - perfectly divisible by number of CUs or number of GPUs, some resources - would be unused. - Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. - - Works with AOMP 19.0-0 or ROCM 6.1 or greater - - cu masking is not available when multiple devices per process are enabled - with -md option (multi-device) mode. - - Notes: - With MPI, this utility distributes GPUs and their CUs across - multiple ranks of an MPI job into mutually exclusive sets of CUs. - It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE - and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a - the mutually exclusive CU mask. - - An rplace (rank place) is a subset of CUs for a rank. - gpurun calculates the number of rplaces needed to contain all - the specified number of ranks for this node. If number of ranks not - divisible by number of GPUs, then there will be more rplaces than ranks. - The number of CUs in an rplace is calculated by dividing the number of - CUs per GPU by the number of rplaces per GPU. 
This is also the number of - bits set in the CU mask. This is also the number of physical locations - available for an OpenMP team to execute. This utility exports that number - to the environment variable OMPX_TARGET_TEAM_PROCS. This value - could be used by the application or runtume to adjust the number - of desired teams in a target region. If no masking occurs, the entire - GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to - the total number of CUs on the GPU. - - Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. - -EOF - exit 0 -} - -_end_gpurun_opts=0 -_devices_per_mdset=1 -_uses_multi_device=0 -while [ "$_end_gpurun_opts" == "0" ] ; do - case "$1" in - -s) GPURUN_VERBOSE=0;; - -q) GPURUN_VERBOSE=0;; - --quiet) GPURUN_VERBOSE=0;; - -h) usage ;; - -help) usage ;; - --help) usage ;; - -version) version ;; - --version) version ;; - -v) GPURUN_VERBOSE=1;; - -vv) GPURUN_VERBOSE=2;; - -m) _use_numactl_membind=1;; - -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; - -nr) _use_numactl_rocr=1;; - -nm) _use_numactl_ompi=1;; - -l) _use_numactl_localalloc=1;; - -nomask) GPURUN_MASK_POLICY="nomask";; - *) _end_gpurun_opts=1; break;; - esac - if [ "$_end_gpurun_opts" == "0" ] ; then - shift - fi -done - -if [ "$GPURUN_BYPASS" = "1" ]; then - execOnError "$@" -fi - -# Default: quiet operation -GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} -# Default: create mutually exclusive sets of CUs when GPU is oversubscribed -GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} -# switch mask policy to preset if HSA_CU_MASK was preset -[[ ! -z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset -# switch mask policy to nomask for multi-device -[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask -# Offset selected device to avoid some heavily used GPUs -GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} - -# Get environment variables set by OpenMPI -_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE -_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK -# If not OpenMPI, check for Platform MPI, MVAPICH -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=$MPI_LOCALNRANKS - _local_rank_num=$MPI_LOCALRANKID -fi -# Also try MPI_COMM_WORLD env vars -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE - _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK -fi -# Check if SLURM was used -if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then - _num_local_ranks=$SLURM_CPUS_ON_NODE - _local_rank_num=$SLURM_LOCALID -fi - -if [ "$_use_numactl_rocr" == "1" ] ; then - _cmd_binary=`which numactl` - if [ $? == 0 ] ; then - numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* - exit $? - else - $* - exit $? - fi -fi -if [ "$_use_numactl_ompi" == "1" ] ; then - _cmd_binary=`which numactl` - if [ $? == 0 ] ; then - numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* - exit $? - else - $* - exit $? - fi -fi -# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=1 - _local_rank_num=0 -fi - -# Find location of the rocminfo binary -AOMP=${AOMP:-_AOMP_INSTALL_DIR_} -if [ ! -d $AOMP ] ; then - AOMP="_AOMP_INSTALL_DIR_" -fi -if [ ! -d $AOMP ] ; then - AOMP="/opt/rocm/lib/llvm" -fi -if [ ! -d $AOMP ] ; then - AOMP="/opt/rocm/llvm" -fi -if [ ! -d $AOMP ] ; then - realpath=`realpath $0` - thisdir=`dirname $realpath` - AOMP=$thisdir/.. -fi -if [ ! 
-d $AOMP ] ; then - >&2 echo "ERROR: AOMP not found at $AOMP" - >&2 echo " Please install AOMP or correctly set env-var AOMP" - execOnError "$@" -fi -ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} -[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo -[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo -if [ ! -f $ROCMINFO_BINARY ] ; then - >&2 echo "ERROR: Could not find binary for rocminfo," - >&2 echo " Please correct installation of ROCM or AOMP compiler" - execOnError "$@" -fi - -# Use rocminfo to find number number of CUs and gfxids for each GPU. -_tfile="/tmp/rinfo_out$$" -$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile -_tfile_lines=`wc -l $_tfile | cut -d" " -f1` -if [ $_tfile_lines == 0 ] ; then - >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" - rm $_tfile - execOnError "$@" -fi -# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device -_ri_all_gfxids="" -_ri_gfxids=() -_ri_cucount=() -_ri_bdfids=() -_ri_dev_idx=() -_ri_num_devices=0 -_last_cu_count=0 -_ri_uuid=() -_last_device_type_was_gpu=0 -_device_type_preset=0 -_ri_num_all_devices=0 -[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 -while read _linepair ; do - _fieldvalue=`echo $_linepair | cut -d":" -f2` - _fieldtype=`echo $_linepair | cut -d":" -f1` - if [ $_fieldvalue == "CPU" ] ; then - _last_device_type_was_gpu=0 - elif [ $_fieldvalue == "GPU" ] ; then - _last_device_type_was_gpu=1 - elif [ "$_fieldtype" == "Uuid" ] ; then - _this_uuid=$_fieldvalue - elif [ "$_fieldtype" == "BDFID" ] ; then - if [[ $_last_device_type_was_gpu == 1 ]] ; then - # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" - _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" - _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" - _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" - fi - elif [ "$_fieldtype" == "Name" ] ; then - # The device name field is last in rocminfo output, so we can create new _ri_ array entry - if [[ $_last_device_type_was_gpu == 1 ]] ; then - _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` - ! [[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid" - _is_type_visible=1 - if [ $_device_type_preset == 1 ] ; then - _is_type_visible=0 - if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then - _is_type_visible=1 - fi - fi - if [ $_is_type_visible == 1 ] ; then - _ri_gfxids+=( $_this_gfxid ) - _ri_cucount+=( $_last_cu_count ) - _ri_bdfids+=( $_bdfidstr ) - _ri_dev_idx+=( $_ri_num_all_devices ) - _ri_uuid+=( $_this_uuid ) - _ri_num_devices=$(( $_ri_num_devices + 1 )) - fi - _ri_num_all_devices=$(( $_ri_num_all_devices + 1 )) - fi - else - # else the _fieldvalue was the number of CUs or GCPUs - if [[ $_last_device_type_was_gpu == 1 ]] ; then - _last_cu_count=$_fieldvalue - fi - fi -done < $_tfile -rm $_tfile - -if [ $_ri_num_devices == 0 ] ; then - if [ $_local_rank_num == 0 ] ; then - if [ $_device_type_preset == 1 ] ; then - >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES." - >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" - else - >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY" - fi - if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then - >&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES" - >&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly." 
- fi - execOnError "$@" - else - execOnError "$@" - fi -fi - -# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per -# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids, -# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information -# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above -# by scanning output from rocminfo. -_sysdevdir="/sys/bus/pci/devices" -_ss_num_devices=0 -_ss_cpulist=() -_ss_bdfid=() -_ss_numanode=() -_ss_uuid=() -_ss_gfxid=() -_ss_cucount=() -for _devid in `ls $_sysdevdir` ; do - if [ -f $_sysdevdir/$_devid/device ] ; then - _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` - if [ ! -z $_driver_name ] ; then - if [ $_driver_name == "DRIVER=amdgpu" ] ; then - _numa_node=`cat $_sysdevdir/$_devid/numa_node` - [ "$_numa_node" == "-1" ] && _numa_node=0 - _this_uuid=0 - if [ -f $_sysdevdir/$_devid/unique_id ] ; then - _this_uuid=`cat $_sysdevdir/$_devid/unique_id` - if [ -z $_this_uuid ] ; then - _this_uuid=0 - _has_unique_id_file=0 - else - _this_uuid="GPU-$_this_uuid" - _has_unique_id_file=1 - fi - fi - _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` - _match_uuid_count=0 - for _ri_i in ${!_ri_bdfids[@]} ; do - _ss_value=$_this_uuid - _ri_value=${_ri_uuid[$_ri_i]} - if [ $_ss_value == $_ri_value ] ; then - _match_uuid_count=$(( $_match_uuid_count + 1 )) - fi - done - # Search _ri_ arrays for matching uuids or matching bdfids. - for _ri_i in ${!_ri_bdfids[@]} ; do - if [ "$_has_unique_id_file" == "1" ] ; then - _ss_value=$_this_uuid - _ri_value=${_ri_uuid[$_ri_i]} - elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then - # Under Hyper-V, we may see a zero BDFID. Fall back to UUID. - _ss_value=$_devid - _ri_value=$_devid - else - _ss_value=$_devid - _ri_value="0000:${_ri_bdfids[$_ri_i]}.0" - fi - if [ $_ss_value == $_ri_value ] ; then - if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then - # Some GPUs do not have unique_id or TPX mode creates multiple - # identical uuids, so use device index for RVD - _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} ) - else - _ss_uuid+=( $_this_uuid ) - fi - _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} ) - _ss_cucount+=( ${_ri_cucount[$_ri_i]} ) - _ss_bdfid+=( $_devid ) - _ss_numanode+=( $_numa_node ) - _ss_cpulist+=( $_this_cpulist ) - _ss_num_devices=$(( $_ss_num_devices + 1 )) - fi - done - fi - fi - fi -done - -if [[ $_ss_num_devices -lt 1 ]] ; then - if [ $_device_type_preset == 1 ] ; then - >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES." - >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" - else - >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir." - fi - execOnError "$@" -fi - -# check for taskset or numactl cmd -if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then - _launch_process_cmd_binary=`which numactl` - if [ $? != 0 ] ; then - >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed." - execOnError "$@" - fi -else - _launch_process_cmd_binary=`which taskset` - if [ $? != 0 ] ; then - >&2 echo "ERROR: $0 requires the taskset command to be installed." - execOnError "$@" - fi -fi -if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then - >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored." 
- _use_numactl_membind=0 -fi - -_utilized_devices=$_ri_num_devices -[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks - -# Calculate number of GPUs to use to evenly spread ranks across GPUs. -# An rplace is a set of CUs that will be used for a rank. -# The number of rplaces must be at least the number of ranks. -_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices )) -_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices )) -if [ $_uncovered_ranks != 0 ] ; then - # If _num_local_ranks not divisible by number of GPUs, - # then add an extra rplace per GPU to make room for remainder. - _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 )) -fi -if [ $GPURUN_MASK_POLICY == "mutex" ] ; then - # For mutex policy, adjacent ranks are assigned to the same device. - _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU )) - # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS - _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) -else - # for mask policies nomask or preset, adjacent ranks are assigned to - # different GPUs and oversubscribed ranks are assigned round robin - _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) -fi - -_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) -if [ $_num_local_ranks -gt $_node_cus ] ; then - >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " - execOnError "$@" -fi - -if [ $_uses_multi_device == 1 ]; then - # Enforce some rules on the use of -md option - # Note -md forces GPURUN_MASK_POLICY=nomask - if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then - >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode" - execOnError "$@" - fi - if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then - >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)" - execOnError "$@" - fi - _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset )) - if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then - printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n Some multi-device sets will overlap.\n" >&2 - fi - _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices)) - _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 )) - - # merge entries for this mdset from per device arrays - _md_bdfs="" - _md_cpus="" - _md_nns="" - _md_uuids="" - _md_dev_idxs="" - _sep="" - for i in `seq $_md_device_set_start $_md_device_set_end` ; do - _dev_index=$i - # handle index wrap around number of devices - [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices )) - _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]} - _new_nn=${_ss_numanode[$_dev_index]} - SAVEIFS=$IFS - IFS="," - _found=0 - for _existing_nn in $_md_nns ; do - [ $_existing_nn == $_new_nn ] && _found=1 - done - IFS=$SAVEIFS - if [ $_found == 0 ] ; then - # only add new numa node and cpulist, if not already in the md set - _md_nns+=$_sep$_new_nn - _md_cpus+=$_sep${_ss_cpulist[$_dev_index]} - fi - _md_uuids+=$_sep${_ss_uuid[$_dev_index]} - _md_dev_idxs+=$_sep$_dev_index - _sep="," - done - _device_num=$_md_device_set_start -fi - -_available_CUs_per_device=${_ss_cucount[$_device_num]} -_gfxid=${_ss_gfxid[$_device_num]} - -_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) -if [ $_num_local_ranks -gt 
$_node_cus ] ; then - >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " - execOnError "$@" -fi - -_utilized_CUs_per_device=$_available_CUs_per_device -_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) -# Lower utilized CUs till divisible by number of rplaces per GPU -while [ $_rem2 != 0 ] ; do - _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 )) - _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) -done -_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU )) - -# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0 -if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then - if [ $_uses_multi_device == 0 ] ; then - _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device )) - _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices )) - _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks )) - _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) - _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace )) - _utilization=$(( ( $_used_cus * 100 ) / $_node_cus )) - if ! [ $_ri_num_devices -gt $_num_local_ranks ] ; then - if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then - _extra_diags=true - fi - fi - >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" - >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" - >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" - [ $_extra_diags ] && \ - >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) " - [ $_extra_diags ] && echo - >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces" - >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU" - [ $_extra_diags ] && \ - >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \ - >&2 echo "- gfxids ${_ss_gfxid[@]}" - >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}" - [ $_extra_diags ] && \ - >&2 echo "-- USED on CUs RANK0: $_utilized_CUs_per_device" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED CUs RANK0 : $_wasted_CUs_on_each_GPU" - >&2 echo "- CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)" - >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU" - if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then - >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES" - fi - if [[ ! -z "$HSA_CU_MASK" ]] ; then - # node utilizatino could be incorrect with preset cumask. 
- >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK" - else - >&2 echo "- NODE UTILIZATION: $_utilization %" - fi - else - >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" - >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" - >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" - >&2 echo "- DEVS PER RANK: $_devices_per_mdset" - >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)" - _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices )) - >&2 echo "- NODE UTILIZATION: $_md_utilization %" - fi -fi -# --- END OF DIAGNOSTIC BLOCK - -if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then - # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace - _bits_to_set=$_CUs_per_rplace - # This formula keeps adjacent ranks on same GPU which should be preferred - _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _device_num * $_utilized_CUs_per_device) )) - # use bc because these values can be very large - _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc` - _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc` - # Calculate the number of leading zeros needed for this mask - _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 )) - for i in `seq 1 $_lz` ; do - _mask="0$_mask" - done - _mask="0x$_mask" -fi - -_launch_process_cmd="" -if [ $_uses_multi_device == 0 ] ; then - # retrieve scanned info from per device arrays - _bdfidstrc=${_ss_bdfid[$_device_num]} - NUMANODE=${_ss_numanode[$_device_num]} - _list_of_cpu_cores=${_ss_cpulist[$_device_num]} - _this_uuid=${_ss_uuid[$_device_num]} -else - # Use multi-device values - _bdfidstrc=$_md_bdfs - NUMANODE=$_md_nns - _list_of_cpu_cores=$_md_cpus - _this_uuid=$_md_uuids - _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset " -fi -if [ "$_use_numactl_localalloc" == "1" ] ; then - _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE" -elif [ "$_use_numactl_membind" == "1" ] ; then - _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE" -else - _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores" -fi - -# If gpurun was not given command to execute, then dont run _launch_process_cmd -[ "$*" == "" ] && _launch_process_cmd="" - -# only set ROCR_VISIBLE_DEVICES if not already set -if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then - export ROCR_VISIBLE_DEVICES=$_this_uuid - _log_word="RVD" -else - _log_word="PRESET-RVD" -fi - -export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace - -# - Limit HSA queues when multiple ranks per GPU -if [ $_number_of_rplaces_per_GPU != 1 ] ; then - # Only set these env controls if not set by caller - [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1 - [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1 -fi - -[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \ - [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK" - -if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then - # --- HSA_CU_MASK is NOT USED in this code block, This code block covers all multi-device execution. 
- if [ "$GPURUN_VERBOSE" != "0" ] ; then - if [ $_uses_multi_device == 1 ] ; then - printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2 - else - printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2 - fi - fi - $_launch_process_cmd $* - # --- end code block -else - # --- HSA_CU_MASK is required in this code block, assumes no multi-device - if [[ -z "$HSA_CU_MASK" ]] ; then - # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: - export HSA_CU_MASK=0:$_mask - else - # use preset mask - _mask=$HSA_CU_MASK - fi - if [ "$GPURUN_VERBOSE" != "0" ] ; then - printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2 - fi - HSA_CU_MASK=0:$_mask \ - $_launch_process_cmd $* - # --- end code block -fi -exit $? +# Future: +# This utility also sets environment variable HSA_CU_MASK +# to control which CUs are available to the process. +# HSA_CU_MASK is set only when more than one OpenMPI process +# (rank) will utilize the same GPU and it is not preset. +# Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the +# number of CUs available to the process after masking. +# +# $ gpurun -topo +# Topology Numa: 0 PageSize: [always] madvise never +# +# GPU Node Affinity UUID Cores +# 0 0 0 GPU-b256278bf70405e2 0-23,96-119 +# 1 1 1 GPU-a33557394e2c744e 24-47,120-143 +# 2 2 2 GPU-4f78640baf57e5f0 48-71,144-167 +# 3 3 3 GPU-b66921701d196e10 72-95,168-191 + +import subprocess +import re +import os +import sys + +if sys.version_info < (3, 7): + print("require minimum python version 3.7 or later") + sys.exit(0) + +noAmdSmi = False + +try: + from amdsmi import * +except ImportError: + noAmdSmi = True + + +def get_amd_smi_static_numa(): + """ + get the output of 'amd-smi static --numa' to extract GPU affinity + and NUMA node information, storing them in arrays indexed by GPU number. 
+    """
+    gpu_affinity = []
+    numa_node = []
+    hip_uuid = []
+    gpu_id = 0
+
+    amdsmi_init()
+
+    try:
+        devices = amdsmi_get_processor_handles()
+        node_number = 0
+        affi_node = 0
+        if len(devices) == 0:
+            print("No GPUs on machine")
+            sys.exit(1)
+        for device in devices:
+            info = amdsmi_get_gpu_enumeration_info(device)
+            node_number = amdsmi_topo_get_numa_node_number(device)
+            if debug_numa > 2: print("****");print("gpu_id: ", gpu_id);print("Numa: ",node_number)
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_node) <= gpu_id:
+                numa_node.append(None)
+            numa_node[gpu_id] = node_number
+            while len(hip_uuid) <= gpu_id:
+                hip_uuid.append(None)
+            hip_uuid[gpu_id] = info['hip_uuid']
+            if debug_numa > 2: print("hip_id: ", info['hip_id']); print("hip_uuid: ", info['hip_uuid'])
+
+            try:
+                affi_node = amdsmi_get_gpu_topo_numa_affinity(device)
+                if affi_node == -1: affi_node = node_number
+                if debug_numa > 2: print("Affinity: ", affi_node)
+            except AmdSmiException as e:
+                if debug_numa > 2: print("N/A")
+
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+            gpu_affinity[gpu_id] = affi_node
+            gpu_id += 1
+    except AmdSmiException as e:
+        print(f"Error executing amd-smi: {e}")
+
+    return gpu_affinity, numa_node, hip_uuid
+
+def parse_rocm_smi_toponuma():
+    """
+    Parses the output of 'rocm-smi --showtoponuma' to extract GPU affinity
+    and NUMA node information, storing them in arrays indexed by GPU number.
+    """
+    try:
+        # Execute the rocm-smi command
+        UIresult = subprocess.run(['rocm-smi', '--showuniqueid'], capture_output=True, text=True, check=True)
+        UIoutput = UIresult.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing rocm-smi: {e}")
+        return None, None, None
+    except FileNotFoundError:
+        print("Error: 'rocm-smi' command not found. Ensure ROCm is installed and in your PATH.")
+        return None, None, None
+
+    hip_uuid = []
+    patternUI = re.compile(r"GPU\[(\d+)\]\s+:\s+Unique\s+ID:\s+0x([0-9a-fA-F]+)")
+    for line in UIoutput.splitlines():
+        match = patternUI.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            UUID = match.group(2)
+            while len(hip_uuid) <= gpu_id:
+                hip_uuid.append(None)
+            hip_uuid[gpu_id] = "GPU-"+UUID
+
+    try:
+        # Execute the rocm-smi command
+        result = subprocess.run(['rocm-smi', '--showtoponuma'], capture_output=True, text=True, check=True)
+        output = result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing rocm-smi: {e}")
+        return None, None, None
+    except FileNotFoundError:
+        print("Error: 'rocm-smi' command not found. 
Ensure ROCm is installed and in your PATH.")
+        return None, None, None
+
+    gpu_affinity = []
+    numa_node = []
+
+    # Regex to find lines containing GPU information (e.g., "GPU[0-9]: Affinity: [0-9]+, Node: [0-9]+")
+    patternAffy = re.compile(r"GPU\[(\d+)\]\s+:\s+\(Topology\) Numa Affinity:\s+(\d+)")
+    patternErrA = re.compile(r"get_numa_affinity_topology, Not supported on the given system")
+    patternNode = re.compile(r"GPU\[(\d+)\]\s+:\s+\(Topology\) Numa Node:\s+(\d+)")
+
+    for line in output.splitlines():
+        match = patternAffy.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            affinity = int(match.group(2))
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+
+            gpu_affinity[gpu_id] = affinity
+        match = patternNode.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            node = int(match.group(2))
+
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_node) <= gpu_id:
+                numa_node.append(None)
+
+            numa_node[gpu_id] = node
+        #cpx tpx etc are missing affinity info, fix it here
+        match = patternErrA.search(line)
+        if match:
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+
+            #use previous gpu_affinity
+            gpu_affinity[gpu_id] = affinity
+    return gpu_affinity, numa_node, hip_uuid
+
+
+def parse_lscpu_numa():
+# get lscpu numa info
+# NUMA node0 CPU(s): 0-7
+# NUMA node1 CPU(s): 8-15
+    try:
+        # Execute the lscpu command
+        result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True)
+        output = result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing lscpu: {e}")
+        return None
+    except FileNotFoundError:
+        print("Error: 'lscpu' command not found.")
+        return None
+
+    numa_cpus = []
+    patternLSCPU = re.compile(r"NUMA node(\d+) CPU\(s\):\s+([\d,-]+)")
+
+    if debug_numa > 2:print("NUMA CPUs:")
+    for line in output.splitlines():
+        match = patternLSCPU.search(line)
+        if match:
+            numa_id = int(match.group(1))
+            cpus = match.group(2)
+            if debug_numa > 2:print(" numa cores:", numa_id, cpus)
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_cpus) <= numa_id:
+                numa_cpus.append(None)
+            numa_cpus[numa_id] = cpus
+    return numa_cpus
+
+def check_numactl_exists():
+    try:
+        subprocess.run(['numactl', '--version'], check=True,
+                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return True
+    # numactl command not found in PATH
+    except FileNotFoundError: return False
+    except subprocess.CalledProcessError: return True
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False
+
+def check_taskset_exists():
+    try:
+        subprocess.run(['taskset', '--version'], check=True,
+                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return True
+    # taskset command not found in PATH
+    except FileNotFoundError: return False
+    except subprocess.CalledProcessError: return True
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False
+
+def helpExit(exCode):
+    if exCode == 1: print("Error: nothing to bind")
+    print("Usage: gpurun [gpurun_options] Program and options")
+    print(" -h --help : display help text")
+    print(" -v : display gpurun command")
+    print(" -vv : display additional debug info")
+    print(" -vvv : display more debug info")
+    print(" -dryrun : do not run bindings")
+    print(" -taskset : use taskset for binding")
+    print(" -numactl : use numactl for binding [default]")
+    print(" -md : Set number of 
desired devices for multi-device mode, default=1")
+    print(" -nr : use numactl ROCR_VISIBLE_DEVICES")
+    print(" -nm : use numactl OMPI_COMM_WORLD_LOCAL_RANK")
+    print(" -topo : display the topology and exit")
+    print(" -rocmsmi : force use of rocm-smi rather than amd-smi")
+    print(" --version : Print version of gpurun and exit")
+    print("")
+    sys.exit(exCode)
+    # still to do
+    # -m use numactl membind to CPUs in same NUMA domain. Note: Allocation
+    # fails when not enough memory available on these nodes.
+    # -l use numactl localalloc to CPUs in same NUMA domain. Note: If
+    # memory cannot be allocated, alloc falls back to other nodes.
+    # support GPU-xxxxxxxx
+
+def processArgs():
+    sysPos=1
+    debug_numa=0
+    use_taskset=False
+    use_numactl=True
+    use_nobind=False
+    use_nr=False
+    use_nm=False
+    use_md=False
+    use_rocmsmi=False
+    md_count=1
+
+    dump_topo=False
+    dry_run=False
+    skip_args = ["-s", "-q", "-m", "-l" ]
+    # loop over bind arguments
+    while True:
+        if len(sys.argv[sysPos:]) == 0: helpExit(1)
+        if sys.argv[sysPos] == "-v": debug_numa=1
+        elif sys.argv[sysPos] == "-vv": debug_numa=2
+        elif sys.argv[sysPos] == "-vvv": debug_numa=3
+        elif sys.argv[sysPos] in ["-h", "--help"]: helpExit(0)
+        elif sys.argv[sysPos] == "--version": print("Version: 22.0.0"); sys.exit(0)
+        elif sys.argv[sysPos] == "-dryrun": dry_run=True
+        elif sys.argv[sysPos] == "-taskset": use_taskset=True; use_numactl=False; use_nobind=False
+        elif sys.argv[sysPos] == "-numactl": use_numactl=True; use_taskset=False; use_nobind=False
+        elif sys.argv[sysPos] == "-nobind": use_nobind=True; use_taskset=False; use_numactl=False
+        elif sys.argv[sysPos] == "-topo": dump_topo=True; break
+        elif sys.argv[sysPos] == "-nr": use_nr=True
+        elif sys.argv[sysPos] == "-nm": use_nm=True
+        elif sys.argv[sysPos] == "-rocmsmi": use_rocmsmi=True
+        elif sys.argv[sysPos] == "-md":
+            use_md=True
+            if sys.argv[sysPos+1].isdigit():
+                md_count=int(sys.argv[sysPos+1])
+                sysPos += 1
+        # to be implemented GPURUN options
+        elif sys.argv[sysPos] in skip_args: skipped_args=True
+        else: break
+        sysPos += 1
+
+    return sysPos, debug_numa, use_taskset, use_numactl, use_nobind, dry_run, use_md, md_count, use_nr, use_nm, dump_topo, use_rocmsmi
+
+def dumpTopology(affinity_data, node_data, hip_uuid, numa_cpus):
+    numaStat=""
+    pageSize=""
+    with open('/proc/sys/kernel/numa_balancing', 'r') as f: numaStat = f.read()
+    with open('/sys/kernel/mm/transparent_hugepage/enabled', 'r') as f: pageSize = f.read()
+    Tb="\t"
+    print("Topology Numa: "+numaStat.strip()+" PageSize: "+pageSize.strip()+"\n\nGPU Node Affinity UUID Cores")
+    for i in range(len(node_data)):
+        print(i, Tb, node_data[i], Tb, affinity_data[i], Tb, hip_uuid[i], Tb, numa_cpus[affinity_data[i]])
+    sys.exit(0)
+
+if __name__ == "__main__":
+    sysPos, debug_numa, use_taskset, use_numactl, use_nobind, dry_run, use_md, md_count, use_nr, use_nm, dump_topo, use_rocmsmi = processArgs()
+    #check for numactl and taskset
+    has_numactl = check_numactl_exists()
+    has_taskset = check_taskset_exists()
+
+    #get topo info
+    if use_taskset or dump_topo: numa_cpus = parse_lscpu_numa()
+    if noAmdSmi or use_rocmsmi:
+        affinity_data, node_data, hip_uuid = parse_rocm_smi_toponuma()
+    else:
+        affinity_data, node_data, hip_uuid = get_amd_smi_static_numa()
+
+    if debug_numa > 1: print(affinity_data, node_data, hip_uuid)
+    if dump_topo: dumpTopology(affinity_data, node_data, hip_uuid, numa_cpus)
+
+    numGpus = len(node_data)
+    rocrVisDev = int(os.environ.get('ROCR_VISIBLE_DEVICES', '-1'))
+    localRank = 
int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', '0')) + numRanksLocal = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE', '1')) + + # support override by envvar + gpurun_bypass = int(os.environ.get('GPURUN_BYPASS', '0')) + if gpurun_bypass: + use_taskset = False + use_numactl = False + use_nobind = True + + if rocrVisDev != -1 or use_nr: + adjRank = rocrVisDev + elif use_nm or use_numactl: + adjRank = localRank % numGpus + else: + adjRank=0 + if debug_numa > 1: + print("#GPUs ", numGpus, "numRanks", numRanksLocal, "localRank", localRank, "adjRank", adjRank, "RVD", rocrVisDev) + if debug_numa > 2: + if affinity_data is not None and node_data is not None: + print("\nGPU Affinity:") + for i, affinity in enumerate(affinity_data): + if affinity is not None: + print(f" GPU {i}: Affinity = {affinity}") + + print("\n GPU NUMA Nodes:") + for i, node in enumerate(node_data): + if node is not None: + print(f" GPU {i}: NUMA Node = {node}") + + my_env = os.environ.copy() + if use_md: + my_env["ROCR_VISIBLE_DEVICES"] = "0,1" + else: + my_env["ROCR_VISIBLE_DEVICES"] = str(adjRank) + if use_taskset and has_taskset: + program_to_run = [ "taskset", "-c", numa_cpus[node_data[adjRank]]] + elif use_numactl and has_numactl: + program_to_run = [ "numactl", "--cpunodebind", str(node_data[adjRank]), "--membind", str(affinity_data[adjRank]) ] + elif use_nobind: + program_to_run = [ ] + else: + program_to_run = [ ] + program_to_run.extend(sys.argv[sysPos:]) + if debug_numa > 0 or dry_run: print("ROCR_VISIBLE_DEVICES", my_env["ROCR_VISIBLE_DEVICES"], " ", program_to_run) + if not dry_run: result = subprocess.run(program_to_run, env=my_env, capture_output=False, text=False, check=False) diff --git a/offload/utils/gpurun/gpurun-old b/offload/utils/gpurun/gpurun-old new file mode 100755 index 0000000000000..870bc7a8ccbcd --- /dev/null +++ b/offload/utils/gpurun/gpurun-old @@ -0,0 +1,697 @@ +#!/bin/bash +# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# gpurun: Process launch utility for GPU applications. This is a wrapper +# to execute application binaries including OpenMPI GPU applications. +# See help message below (gpurun -h) for more information. +# +# Usage Examples: +# gpurun true +# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# + +# If set to 1, just invoke the rest of the command line without doing anything +# else. 
+GPURUN_BYPASS=${GPURUN_BYPASS:-0} + +function execOnError() { + exec "$@" +} + +# PROGVERSION string is updated by cmake when component is installed +PROGVERSION=X.Y-Z +function version(){ + echo $0 version $PROGVERSION + exit 0 +} +function usage(){ +/bin/cat 2>&1 <<"EOF" + + gpurun: Application process launch utility for GPUs. + This utility ensures the process will enable either a single + GPU or the number specified with -md (multi-device) option. + It launches the application binary with either the 'taskset' + or 'numactl' utility so the process only runs on CPU cores + in the same NUMA domain as the selected GPUs. + This utility sets environment variable ROCR_VISIBLE_DEVICES + to selected GPUs ONLY if it was not already set by the + callers environment AND the number of GPUs is not 1. + This utility also sets environment variable HSA_CU_MASK + to control which CUs are available to the process. + HSA_CU_MASK is set only when more than one OpenMPI process + (rank) will utilize the same GPU and it is not preset. + Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the + number of CUs available to the process after masking. + + Usage: + gpurun [ ] + mpirun -np gpurun [ ] + + Options: + -h Print this help message and exit + -md Set number of desired devices for multi-device mode, default=1 + -s suppress output, often useful in benchmarking + -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 + -v Verbose output, same as GPURUN_VERBOSE=1 + -vv Verbose output, same as GPURUN_VERBOSE=2 + -m use numactl membind to CPUs in same NUMA domain. Note: Allocation + fails when not enough memory available on these nodes. + -l use numactl localalloc to CPUs in same NUMA domain. Note: If + memory cannot be allocated, alloc falls back to other nodes. + -nr use numactl ROCR_VISIBLE_DEVICES + -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK + --version Print version of gpurun and exit + + Optional Input environment variables: + GPURUN_VERBOSE + 0: default for silent operation, no trace printed to stderr + 1: -v prints trace record including process launch cmd to stderr + 2: -vv prints trace and other summary diagnostics + ROCMINFO_BINARY Set location of rocminfo binary + AOMP: location of AOMP or ROCM + GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. + This only works for single device mode. + GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards + GPURUN_MASK_POLICY : useful if machine has different GPU cards + ROCR_VISIBLE_DEVICES: See description above + OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi + OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi + This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID + and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK + + Generated (output) Environment Variables: + OMPX_TARGET_TEAM_PROCS - Number of CUs available to process + ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset + HSA_CU_MASK - The CU mask for the device. + LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument + GPU_MAX_HW_QUEUES + LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" + + Limitations: + - Currently, gpurun creates masks that are mutually exclusive of each other. + That is, the MPI processes will not share CUs. If number of ranks is not + perfectly divisible by number of CUs or number of GPUs, some resources + would be unused. + Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. 
+ - Works with AOMP 19.0-0 or ROCM 6.1 or greater + - cu masking is not available when multiple devices per process are enabled + with -md option (multi-device) mode. + + Notes: + With MPI, this utility distributes GPUs and their CUs across + multiple ranks of an MPI job into mutually exclusive sets of CUs. + It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE + and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a + the mutually exclusive CU mask. + + An rplace (rank place) is a subset of CUs for a rank. + gpurun calculates the number of rplaces needed to contain all + the specified number of ranks for this node. If number of ranks not + divisible by number of GPUs, then there will be more rplaces than ranks. + The number of CUs in an rplace is calculated by dividing the number of + CUs per GPU by the number of rplaces per GPU. This is also the number of + bits set in the CU mask. This is also the number of physical locations + available for an OpenMP team to execute. This utility exports that number + to the environment variable OMPX_TARGET_TEAM_PROCS. This value + could be used by the application or runtume to adjust the number + of desired teams in a target region. If no masking occurs, the entire + GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to + the total number of CUs on the GPU. + + Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. + +EOF + exit 0 +} + +_end_gpurun_opts=0 +_devices_per_mdset=1 +_uses_multi_device=0 +while [ "$_end_gpurun_opts" == "0" ] ; do + case "$1" in + -s) GPURUN_VERBOSE=0;; + -q) GPURUN_VERBOSE=0;; + --quiet) GPURUN_VERBOSE=0;; + -h) usage ;; + -help) usage ;; + --help) usage ;; + -version) version ;; + --version) version ;; + -v) GPURUN_VERBOSE=1;; + -vv) GPURUN_VERBOSE=2;; + -m) _use_numactl_membind=1;; + -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; + -nr) _use_numactl_rocr=1;; + -nm) _use_numactl_ompi=1;; + -l) _use_numactl_localalloc=1;; + -nomask) GPURUN_MASK_POLICY="nomask";; + *) _end_gpurun_opts=1; break;; + esac + if [ "$_end_gpurun_opts" == "0" ] ; then + shift + fi +done + +if [ "$GPURUN_BYPASS" = "1" ]; then + execOnError "$@" +fi + +# Default: quiet operation +GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} +# Default: create mutually exclusive sets of CUs when GPU is oversubscribed +GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} +# switch mask policy to preset if HSA_CU_MASK was preset +[[ ! -z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset +# switch mask policy to nomask for multi-device +[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask +# Offset selected device to avoid some heavily used GPUs +GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} + +# Get environment variables set by OpenMPI +_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE +_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK +# If not OpenMPI, check for Platform MPI, MVAPICH +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_LOCALNRANKS + _local_rank_num=$MPI_LOCALRANKID +fi +# Also try MPI_COMM_WORLD env vars +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE + _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK +fi +# Check if SLURM was used +if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then + _num_local_ranks=$SLURM_CPUS_ON_NODE + _local_rank_num=$SLURM_LOCALID +fi + +if [ "$_use_numactl_rocr" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* + exit $? + else + $* + exit $? 
+ fi +fi +if [ "$_use_numactl_ompi" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* + exit $? + else + $* + exit $? + fi +fi +# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=1 + _local_rank_num=0 +fi + +# Find location of the rocminfo binary +AOMP=${AOMP:-_AOMP_INSTALL_DIR_} +if [ ! -d $AOMP ] ; then + AOMP="_AOMP_INSTALL_DIR_" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/lib/llvm" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/llvm" +fi +if [ ! -d $AOMP ] ; then + realpath=`realpath $0` + thisdir=`dirname $realpath` + AOMP=$thisdir/.. +fi +if [ ! -d $AOMP ] ; then + >&2 echo "ERROR: AOMP not found at $AOMP" + >&2 echo " Please install AOMP or correctly set env-var AOMP" + execOnError "$@" +fi +ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo +if [ ! -f $ROCMINFO_BINARY ] ; then + >&2 echo "ERROR: Could not find binary for rocminfo," + >&2 echo " Please correct installation of ROCM or AOMP compiler" + execOnError "$@" +fi + +# Use rocminfo to find number number of CUs and gfxids for each GPU. +_tfile="/tmp/rinfo_out$$" +$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile +_tfile_lines=`wc -l $_tfile | cut -d" " -f1` +if [ $_tfile_lines == 0 ] ; then + >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" + rm $_tfile + execOnError "$@" +fi +# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device +_ri_all_gfxids="" +_ri_gfxids=() +_ri_cucount=() +_ri_bdfids=() +_ri_dev_idx=() +_ri_num_devices=0 +_last_cu_count=0 +_ri_uuid=() +_last_device_type_was_gpu=0 +_device_type_preset=0 +_ri_num_all_devices=0 +[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 +while read _linepair ; do + _fieldvalue=`echo $_linepair | cut -d":" -f2` + _fieldtype=`echo $_linepair | cut -d":" -f1` + if [ $_fieldvalue == "CPU" ] ; then + _last_device_type_was_gpu=0 + elif [ $_fieldvalue == "GPU" ] ; then + _last_device_type_was_gpu=1 + elif [ "$_fieldtype" == "Uuid" ] ; then + _this_uuid=$_fieldvalue + elif [ "$_fieldtype" == "BDFID" ] ; then + if [[ $_last_device_type_was_gpu == 1 ]] ; then + # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" + _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" + _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" + _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" + fi + elif [ "$_fieldtype" == "Name" ] ; then + # The device name field is last in rocminfo output, so we can create new _ri_ array entry + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` + ! 
+      [[ ${_ri_all_gfxids} != *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid"
+      _is_type_visible=1
+      if [ $_device_type_preset == 1 ] ; then
+        _is_type_visible=0
+        if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then
+          _is_type_visible=1
+        fi
+      fi
+      if [ $_is_type_visible == 1 ] ; then
+        _ri_gfxids+=( $_this_gfxid )
+        _ri_cucount+=( $_last_cu_count )
+        _ri_bdfids+=( $_bdfidstr )
+        _ri_dev_idx+=( $_ri_num_all_devices )
+        _ri_uuid+=( $_this_uuid )
+        _ri_num_devices=$(( $_ri_num_devices + 1 ))
+      fi
+      _ri_num_all_devices=$(( $_ri_num_all_devices + 1 ))
+    fi
+  else
+    # else the _fieldvalue was the number of CUs or GCPUs
+    if [[ $_last_device_type_was_gpu == 1 ]] ; then
+      _last_cu_count=$_fieldvalue
+    fi
+  fi
+done < $_tfile
+rm $_tfile
+
+if [ $_ri_num_devices == 0 ] ; then
+  if [ $_local_rank_num == 0 ] ; then
+    if [ $_device_type_preset == 1 ] ; then
+      >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES."
+      >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+    else
+      >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY"
+    fi
+    if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then
+      >&2 echo "       ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES"
+      >&2 echo "       Consider unsetting ROCR_VISIBLE_DEVICES and letting gpurun set it correctly."
+    fi
+    execOnError "$@"
+  else
+    execOnError "$@"
+  fi
+fi
+
+# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per
+# device arrays indexed by device num. The arrays are _ss_cpulist, _ss_bdfid,
+# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information
+# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above
+# by scanning output from rocminfo.
+_sysdevdir="/sys/bus/pci/devices"
+_ss_num_devices=0
+_ss_cpulist=()
+_ss_bdfid=()
+_ss_numanode=()
+_ss_uuid=()
+_ss_gfxid=()
+_ss_cucount=()
+for _devid in `ls $_sysdevdir` ; do
+  if [ -f $_sysdevdir/$_devid/device ] ; then
+    _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'`
+    if [ ! -z $_driver_name ] ; then
+      if [ $_driver_name == "DRIVER=amdgpu" ] ; then
+        _numa_node=`cat $_sysdevdir/$_devid/numa_node`
+        [ "$_numa_node" == "-1" ] && _numa_node=0
+        _this_uuid=0
+        if [ -f $_sysdevdir/$_devid/unique_id ] ; then
+          _this_uuid=`cat $_sysdevdir/$_devid/unique_id`
+          if [ -z $_this_uuid ] ; then
+            _this_uuid=0
+            _has_unique_id_file=0
+          else
+            _this_uuid="GPU-$_this_uuid"
+            _has_unique_id_file=1
+          fi
+        fi
+        _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist`
+        _match_uuid_count=0
+        for _ri_i in ${!_ri_bdfids[@]} ; do
+          _ss_value=$_this_uuid
+          _ri_value=${_ri_uuid[$_ri_i]}
+          if [ $_ss_value == $_ri_value ] ; then
+            _match_uuid_count=$(( $_match_uuid_count + 1 ))
+          fi
+        done
+        # Search _ri_ arrays for matching uuids or matching bdfids.
+        for _ri_i in ${!_ri_bdfids[@]} ; do
+          if [ "$_has_unique_id_file" == "1" ] ; then
+            _ss_value=$_this_uuid
+            _ri_value=${_ri_uuid[$_ri_i]}
+          elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then
+            # Under Hyper-V, we may see a zero BDFID. Fall back to UUID.
+            _ss_value=$_devid
+            _ri_value=$_devid
+          else
+            _ss_value=$_devid
+            _ri_value="0000:${_ri_bdfids[$_ri_i]}.0"
+          fi
+          if [ $_ss_value == $_ri_value ] ; then
+            if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then
+              # Some GPUs do not have unique_id or TPX mode creates multiple
+              # identical uuids, so use device index for RVD
+              _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} )
+            else
+              _ss_uuid+=( $_this_uuid )
+            fi
+            _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} )
+            _ss_cucount+=( ${_ri_cucount[$_ri_i]} )
+            _ss_bdfid+=( $_devid )
+            _ss_numanode+=( $_numa_node )
+            _ss_cpulist+=( $_this_cpulist )
+            _ss_num_devices=$(( $_ss_num_devices + 1 ))
+          fi
+        done
+      fi
+    fi
+  fi
+done
+
+if [[ $_ss_num_devices -lt 1 ]] ; then
+  if [ $_device_type_preset == 1 ] ; then
+    >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES."
+    >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+  else
+    >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir."
+  fi
+  execOnError "$@"
+fi
+
+# check for taskset or numactl cmd
+if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_process_cmd_binary=`which numactl`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: The -m (membind) and -l (localalloc) options require numactl to be installed."
+    execOnError "$@"
+  fi
+else
+  _launch_process_cmd_binary=`which taskset`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: $0 requires the taskset command to be installed."
+    execOnError "$@"
+  fi
+fi
+if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then
+  >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored."
+  _use_numactl_membind=0
+fi
+
+_utilized_devices=$_ri_num_devices
+[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks
+
+# Calculate number of GPUs to use to evenly spread ranks across GPUs.
+# An rplace is a set of CUs that will be used for a rank.
+# The number of rplaces must be at least the number of ranks.
+_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices ))
+_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices ))
+if [ $_uncovered_ranks != 0 ] ; then
+  # If _num_local_ranks not divisible by number of GPUs,
+  # then add an extra rplace per GPU to make room for remainder.
+  _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 ))
+fi
+if [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+  # For mutex policy, adjacent ranks are assigned to the same device.
+  _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU ))
+  # Some users want to avoid dev 0, etc., by setting GPURUN_DEVICE_BIAS
+  _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+else
+  # For mask policies nomask or preset, adjacent ranks are assigned to
+  # different GPUs and oversubscribed ranks are assigned round robin.
+  _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+fi
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+  >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks"
+  execOnError "$@"
+fi
+
+if [ $_uses_multi_device == 1 ]; then
+  # Enforce some rules on the use of -md option
+  # Note -md forces GPURUN_MASK_POLICY=nomask
+  if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then
+    >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode"
+    execOnError "$@"
+  fi
+  if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then
+    >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)"
+    execOnError "$@"
+  fi
+  _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset ))
+  if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then
+    printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices exceeds available devices ($_ri_num_devices)\n         Some multi-device sets will overlap.\n" >&2
+  fi
+  _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices ))
+  _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 ))
+
+  # merge entries for this mdset from per device arrays
+  _md_bdfs=""
+  _md_cpus=""
+  _md_nns=""
+  _md_uuids=""
+  _md_dev_idxs=""
+  _sep=""
+  for i in `seq $_md_device_set_start $_md_device_set_end` ; do
+    _dev_index=$i
+    # handle index wrap around number of devices
+    [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices ))
+    _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]}
+    _new_nn=${_ss_numanode[$_dev_index]}
+    SAVEIFS=$IFS
+    IFS=","
+    _found=0
+    for _existing_nn in $_md_nns ; do
+      [ $_existing_nn == $_new_nn ] && _found=1
+    done
+    IFS=$SAVEIFS
+    if [ $_found == 0 ] ; then
+      # only add new numa node and cpulist, if not already in the md set
+      _md_nns+=$_sep$_new_nn
+      _md_cpus+=$_sep${_ss_cpulist[$_dev_index]}
+    fi
+    _md_uuids+=$_sep${_ss_uuid[$_dev_index]}
+    _md_dev_idxs+=$_sep$_dev_index
+    _sep=","
+  done
+  _device_num=$_md_device_set_start
+fi
+
+_available_CUs_per_device=${_ss_cucount[$_device_num]}
+_gfxid=${_ss_gfxid[$_device_num]}
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+  >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks"
+  execOnError "$@"
+fi
+
+_utilized_CUs_per_device=$_available_CUs_per_device
+_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+# Lower utilized CUs till divisible by number of rplaces per GPU
+while [ $_rem2 != 0 ] ; do
+  _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 ))
+  _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+done
+_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU ))
+
+# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0
+if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then
+  if [ $_uses_multi_device == 0 ] ; then
+    _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device ))
+    _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices ))
+    _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks ))
+    _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))
+    _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace ))
+    _utilization=$(( ( $_used_cus * 100 ) / $_node_cus ))
+    if ! [ $_ri_num_devices -gt $_num_local_ranks ] ; then
+      if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then
+        _extra_diags=true
+      fi
+    fi
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))"
+    [ $_extra_diags ] && echo
+    >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces"
+    >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \
+      >&2 echo "- gfxids ${_ss_gfxid[@]}"
+    >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED CUs on RANK0: $_utilized_CUs_per_device"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED CUs RANK0: $_wasted_CUs_on_each_GPU"
+    >&2 echo "- CUs per RPLACE RANK0: $_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
+    >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
+    if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then
+      >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES"
+    fi
+    if [[ ! -z "$HSA_CU_MASK" ]] ; then
+      # node utilization could be incorrect with a preset cumask.
+      >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK"
+    else
+      >&2 echo "- NODE UTILIZATION: $_utilization %"
+    fi
+  else
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    >&2 echo "- DEVS PER RANK: $_devices_per_mdset"
+    >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)"
+    _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices ))
+    >&2 echo "- NODE UTILIZATION: $_md_utilization %"
+  fi
+fi
+# --- END OF DIAGNOSTIC BLOCK
+
+if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+  # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace
+  _bits_to_set=$_CUs_per_rplace
+  # This formula keeps adjacent ranks on the same GPU, which should be preferred
+  _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set ) - ( _device_num * $_utilized_CUs_per_device ) ))
+  # use bc because these values can be very large
+  _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc`
+  _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc`
+  # Calculate the number of leading zeros needed for this mask
+  _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 ))
+  for i in `seq 1 $_lz` ; do
+    _mask="0$_mask"
+  done
+  _mask="0x$_mask"
+fi
+
+_launch_process_cmd=""
+if [ $_uses_multi_device == 0 ] ; then
+  # retrieve scanned info from per device arrays
+  _bdfidstrc=${_ss_bdfid[$_device_num]}
+  NUMANODE=${_ss_numanode[$_device_num]}
+  _list_of_cpu_cores=${_ss_cpulist[$_device_num]}
+  _this_uuid=${_ss_uuid[$_device_num]}
+else
+  # Use multi-device values
+  _bdfidstrc=$_md_bdfs
+  NUMANODE=$_md_nns
+  _list_of_cpu_cores=$_md_cpus
+  _this_uuid=$_md_uuids
+  _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset "
+fi
+if [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE"
+elif [ "$_use_numactl_membind" == "1" ] ; then
+  _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE"
+else
+  _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores"
+fi
+
+# If gpurun was not given a command to execute, then don't run _launch_process_cmd
+[ "$*" == "" ] && _launch_process_cmd=""
+
+# only set ROCR_VISIBLE_DEVICES if not already set
+if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then
+  export ROCR_VISIBLE_DEVICES=$_this_uuid
+  _log_word="RVD"
+else
+  _log_word="PRESET-RVD"
+fi
+
+export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace
+
+# - Limit HSA queues when multiple ranks per GPU
+if [ $_number_of_rplaces_per_GPU != 1 ] ; then
+  # Only set these env controls if not set by caller
+  [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1
+  [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1
+fi
+
+[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \
+  [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK"
+
+if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then
+  # --- HSA_CU_MASK is NOT USED in this code block. This block covers all multi-device execution.
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    if [ $_uses_multi_device == 1 ] ; then
+      printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2
+    else
+      printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2
+    fi
+  fi
+  $_launch_process_cmd $*
+  # --- end code block
+else
+  # --- HSA_CU_MASK is required in this code block, assumes no multi-device
+  if [[ -z "$HSA_CU_MASK" ]] ; then
+    # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
+    export HSA_CU_MASK=0:$_mask
+  else
+    # use preset mask
+    _mask=$HSA_CU_MASK
+  fi
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2
+  fi
+  HSA_CU_MASK=0:$_mask \
+    $_launch_process_cmd $*
+  # --- end code block
+fi
+exit $?