diff --git a/offload/utils/gpurun/CMakeLists.txt b/offload/utils/gpurun/CMakeLists.txt index 0483a5737b830..3d85b681c90d1 100644 --- a/offload/utils/gpurun/CMakeLists.txt +++ b/offload/utils/gpurun/CMakeLists.txt @@ -1 +1,2 @@ add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun) +add_openmp_util(${CMAKE_CURRENT_SOURCE_DIR}/gpurun-old) diff --git a/offload/utils/gpurun/gpurun b/offload/utils/gpurun/gpurun index 870bc7a8ccbcd..679269ce5c246 100755 --- a/offload/utils/gpurun/gpurun +++ b/offload/utils/gpurun/gpurun @@ -1,697 +1,374 @@ -#!/bin/bash -# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. +#!/usr/bin/env python3 # -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies -# of the Software, and to permit persons to whom the Software is furnished to do so, -# subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, -# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. +# Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved. # -# gpurun: Process launch utility for GPU applications. This is a wrapper -# to execute application binaries including OpenMPI GPU applications. -# See help message below (gpurun -h) for more information. +# gpurun: Application process launch utility for GPUs. +# This utility ensures the process will enable either a single +# GPU or the number specified with -md (multi-device) option. +# It launches the application binary with either the 'taskset' +# or 'numactl' utility so the process only runs on CPU cores +# in the same NUMA domain as the selected GPUs. # -# Usage Examples: -# gpurun true -# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# This utility sets environment variable ROCR_VISIBLE_DEVICES +# to selected GPUs ONLY if it was not already set by the +# callers environment AND the number of GPUs is not 1. # - -# If set to 1, just invoke the rest of the command line without doing anything -# else. -GPURUN_BYPASS=${GPURUN_BYPASS:-0} - -function execOnError() { - exec "$@" -} - -# PROGVERSION string is updated by cmake when component is installed -PROGVERSION=X.Y-Z -function version(){ - echo $0 version $PROGVERSION - exit 0 -} -function usage(){ -/bin/cat 2>&1 <<"EOF" - - gpurun: Application process launch utility for GPUs. - This utility ensures the process will enable either a single - GPU or the number specified with -md (multi-device) option. - It launches the application binary with either the 'taskset' - or 'numactl' utility so the process only runs on CPU cores - in the same NUMA domain as the selected GPUs. - This utility sets environment variable ROCR_VISIBLE_DEVICES - to selected GPUs ONLY if it was not already set by the - callers environment AND the number of GPUs is not 1. 
- This utility also sets environment variable HSA_CU_MASK - to control which CUs are available to the process. - HSA_CU_MASK is set only when more than one OpenMPI process - (rank) will utilize the same GPU and it is not preset. - Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the - number of CUs available to the process after masking. - - Usage: - gpurun [ ] - mpirun -np gpurun [ ] - - Options: - -h Print this help message and exit - -md Set number of desired devices for multi-device mode, default=1 - -s suppress output, often useful in benchmarking - -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 - -v Verbose output, same as GPURUN_VERBOSE=1 - -vv Verbose output, same as GPURUN_VERBOSE=2 - -m use numactl membind to CPUs in same NUMA domain. Note: Allocation - fails when not enough memory available on these nodes. - -l use numactl localalloc to CPUs in same NUMA domain. Note: If - memory cannot be allocated, alloc falls back to other nodes. - -nr use numactl ROCR_VISIBLE_DEVICES - -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK - --version Print version of gpurun and exit - - Optional Input environment variables: - GPURUN_VERBOSE - 0: default for silent operation, no trace printed to stderr - 1: -v prints trace record including process launch cmd to stderr - 2: -vv prints trace and other summary diagnostics - ROCMINFO_BINARY Set location of rocminfo binary - AOMP: location of AOMP or ROCM - GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. - This only works for single device mode. - GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards - GPURUN_MASK_POLICY : useful if machine has different GPU cards - ROCR_VISIBLE_DEVICES: See description above - OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi - OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi - This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID - and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK - - Generated (output) Environment Variables: - OMPX_TARGET_TEAM_PROCS - Number of CUs available to process - ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset - HSA_CU_MASK - The CU mask for the device. - LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument - GPU_MAX_HW_QUEUES - LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" - - Limitations: - - Currently, gpurun creates masks that are mutually exclusive of each other. - That is, the MPI processes will not share CUs. If number of ranks is not - perfectly divisible by number of CUs or number of GPUs, some resources - would be unused. - Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. - - Works with AOMP 19.0-0 or ROCM 6.1 or greater - - cu masking is not available when multiple devices per process are enabled - with -md option (multi-device) mode. - - Notes: - With MPI, this utility distributes GPUs and their CUs across - multiple ranks of an MPI job into mutually exclusive sets of CUs. - It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE - and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a - the mutually exclusive CU mask. - - An rplace (rank place) is a subset of CUs for a rank. - gpurun calculates the number of rplaces needed to contain all - the specified number of ranks for this node. If number of ranks not - divisible by number of GPUs, then there will be more rplaces than ranks. - The number of CUs in an rplace is calculated by dividing the number of - CUs per GPU by the number of rplaces per GPU. 
This is also the number of - bits set in the CU mask. This is also the number of physical locations - available for an OpenMP team to execute. This utility exports that number - to the environment variable OMPX_TARGET_TEAM_PROCS. This value - could be used by the application or runtume to adjust the number - of desired teams in a target region. If no masking occurs, the entire - GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to - the total number of CUs on the GPU. - - Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. - -EOF - exit 0 -} - -_end_gpurun_opts=0 -_devices_per_mdset=1 -_uses_multi_device=0 -while [ "$_end_gpurun_opts" == "0" ] ; do - case "$1" in - -s) GPURUN_VERBOSE=0;; - -q) GPURUN_VERBOSE=0;; - --quiet) GPURUN_VERBOSE=0;; - -h) usage ;; - -help) usage ;; - --help) usage ;; - -version) version ;; - --version) version ;; - -v) GPURUN_VERBOSE=1;; - -vv) GPURUN_VERBOSE=2;; - -m) _use_numactl_membind=1;; - -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; - -nr) _use_numactl_rocr=1;; - -nm) _use_numactl_ompi=1;; - -l) _use_numactl_localalloc=1;; - -nomask) GPURUN_MASK_POLICY="nomask";; - *) _end_gpurun_opts=1; break;; - esac - if [ "$_end_gpurun_opts" == "0" ] ; then - shift - fi -done - -if [ "$GPURUN_BYPASS" = "1" ]; then - execOnError "$@" -fi - -# Default: quiet operation -GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} -# Default: create mutually exclusive sets of CUs when GPU is oversubscribed -GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} -# switch mask policy to preset if HSA_CU_MASK was preset -[[ ! -z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset -# switch mask policy to nomask for multi-device -[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask -# Offset selected device to avoid some heavily used GPUs -GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} - -# Get environment variables set by OpenMPI -_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE -_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK -# If not OpenMPI, check for Platform MPI, MVAPICH -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=$MPI_LOCALNRANKS - _local_rank_num=$MPI_LOCALRANKID -fi -# Also try MPI_COMM_WORLD env vars -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE - _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK -fi -# Check if SLURM was used -if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then - _num_local_ranks=$SLURM_CPUS_ON_NODE - _local_rank_num=$SLURM_LOCALID -fi - -if [ "$_use_numactl_rocr" == "1" ] ; then - _cmd_binary=`which numactl` - if [ $? == 0 ] ; then - numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* - exit $? - else - $* - exit $? - fi -fi -if [ "$_use_numactl_ompi" == "1" ] ; then - _cmd_binary=`which numactl` - if [ $? == 0 ] ; then - numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* - exit $? - else - $* - exit $? - fi -fi -# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU -if [ -z "$_num_local_ranks" ] ; then - _num_local_ranks=1 - _local_rank_num=0 -fi - -# Find location of the rocminfo binary -AOMP=${AOMP:-_AOMP_INSTALL_DIR_} -if [ ! -d $AOMP ] ; then - AOMP="_AOMP_INSTALL_DIR_" -fi -if [ ! -d $AOMP ] ; then - AOMP="/opt/rocm/lib/llvm" -fi -if [ ! -d $AOMP ] ; then - AOMP="/opt/rocm/llvm" -fi -if [ ! -d $AOMP ] ; then - realpath=`realpath $0` - thisdir=`dirname $realpath` - AOMP=$thisdir/.. -fi -if [ ! 
-d $AOMP ] ; then - >&2 echo "ERROR: AOMP not found at $AOMP" - >&2 echo " Please install AOMP or correctly set env-var AOMP" - execOnError "$@" -fi -ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} -[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo -[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo -if [ ! -f $ROCMINFO_BINARY ] ; then - >&2 echo "ERROR: Could not find binary for rocminfo," - >&2 echo " Please correct installation of ROCM or AOMP compiler" - execOnError "$@" -fi - -# Use rocminfo to find number number of CUs and gfxids for each GPU. -_tfile="/tmp/rinfo_out$$" -$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile -_tfile_lines=`wc -l $_tfile | cut -d" " -f1` -if [ $_tfile_lines == 0 ] ; then - >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" - rm $_tfile - execOnError "$@" -fi -# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device -_ri_all_gfxids="" -_ri_gfxids=() -_ri_cucount=() -_ri_bdfids=() -_ri_dev_idx=() -_ri_num_devices=0 -_last_cu_count=0 -_ri_uuid=() -_last_device_type_was_gpu=0 -_device_type_preset=0 -_ri_num_all_devices=0 -[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 -while read _linepair ; do - _fieldvalue=`echo $_linepair | cut -d":" -f2` - _fieldtype=`echo $_linepair | cut -d":" -f1` - if [ $_fieldvalue == "CPU" ] ; then - _last_device_type_was_gpu=0 - elif [ $_fieldvalue == "GPU" ] ; then - _last_device_type_was_gpu=1 - elif [ "$_fieldtype" == "Uuid" ] ; then - _this_uuid=$_fieldvalue - elif [ "$_fieldtype" == "BDFID" ] ; then - if [[ $_last_device_type_was_gpu == 1 ]] ; then - # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" - _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" - _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" - _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" - fi - elif [ "$_fieldtype" == "Name" ] ; then - # The device name field is last in rocminfo output, so we can create new _ri_ array entry - if [[ $_last_device_type_was_gpu == 1 ]] ; then - _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` - ! [[ ${_ri_all_gfxids} == *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid" - _is_type_visible=1 - if [ $_device_type_preset == 1 ] ; then - _is_type_visible=0 - if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then - _is_type_visible=1 - fi - fi - if [ $_is_type_visible == 1 ] ; then - _ri_gfxids+=( $_this_gfxid ) - _ri_cucount+=( $_last_cu_count ) - _ri_bdfids+=( $_bdfidstr ) - _ri_dev_idx+=( $_ri_num_all_devices ) - _ri_uuid+=( $_this_uuid ) - _ri_num_devices=$(( $_ri_num_devices + 1 )) - fi - _ri_num_all_devices=$(( $_ri_num_all_devices + 1 )) - fi - else - # else the _fieldvalue was the number of CUs or GCPUs - if [[ $_last_device_type_was_gpu == 1 ]] ; then - _last_cu_count=$_fieldvalue - fi - fi -done < $_tfile -rm $_tfile - -if [ $_ri_num_devices == 0 ] ; then - if [ $_local_rank_num == 0 ] ; then - if [ $_device_type_preset == 1 ] ; then - >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES." - >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" - else - >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY" - fi - if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then - >&2 echo " ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES" - >&2 echo " Consider unset ROCR_VISIBLE_DEVICES and let gpurun set it correctly." 
- fi - execOnError "$@" - else - execOnError "$@" - fi -fi - -# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per -# device arrays indexed by device num. The arrays are _ss_cpulist _ss_bdfids, -# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information -# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above -# by scanning output from rocminfo. -_sysdevdir="/sys/bus/pci/devices" -_ss_num_devices=0 -_ss_cpulist=() -_ss_bdfid=() -_ss_numanode=() -_ss_uuid=() -_ss_gfxid=() -_ss_cucount=() -for _devid in `ls $_sysdevdir` ; do - if [ -f $_sysdevdir/$_devid/device ] ; then - _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'` - if [ ! -z $_driver_name ] ; then - if [ $_driver_name == "DRIVER=amdgpu" ] ; then - _numa_node=`cat $_sysdevdir/$_devid/numa_node` - [ "$_numa_node" == "-1" ] && _numa_node=0 - _this_uuid=0 - if [ -f $_sysdevdir/$_devid/unique_id ] ; then - _this_uuid=`cat $_sysdevdir/$_devid/unique_id` - if [ -z $_this_uuid ] ; then - _this_uuid=0 - _has_unique_id_file=0 - else - _this_uuid="GPU-$_this_uuid" - _has_unique_id_file=1 - fi - fi - _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist` - _match_uuid_count=0 - for _ri_i in ${!_ri_bdfids[@]} ; do - _ss_value=$_this_uuid - _ri_value=${_ri_uuid[$_ri_i]} - if [ $_ss_value == $_ri_value ] ; then - _match_uuid_count=$(( $_match_uuid_count + 1 )) - fi - done - # Search _ri_ arrays for matching uuids or matching bdfids. - for _ri_i in ${!_ri_bdfids[@]} ; do - if [ "$_has_unique_id_file" == "1" ] ; then - _ss_value=$_this_uuid - _ri_value=${_ri_uuid[$_ri_i]} - elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then - # Under Hyper-V, we may see a zero BDFID. Fall back to UUID. - _ss_value=$_devid - _ri_value=$_devid - else - _ss_value=$_devid - _ri_value="0000:${_ri_bdfids[$_ri_i]}.0" - fi - if [ $_ss_value == $_ri_value ] ; then - if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then - # Some GPUs do not have unique_id or TPX mode creates multiple - # identical uuids, so use device index for RVD - _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} ) - else - _ss_uuid+=( $_this_uuid ) - fi - _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} ) - _ss_cucount+=( ${_ri_cucount[$_ri_i]} ) - _ss_bdfid+=( $_devid ) - _ss_numanode+=( $_numa_node ) - _ss_cpulist+=( $_this_cpulist ) - _ss_num_devices=$(( $_ss_num_devices + 1 )) - fi - done - fi - fi - fi -done - -if [[ $_ss_num_devices -lt 1 ]] ; then - if [ $_device_type_preset == 1 ] ; then - >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES." - >&2 echo " Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}" - else - >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir." - fi - execOnError "$@" -fi - -# check for taskset or numactl cmd -if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then - _launch_process_cmd_binary=`which numactl` - if [ $? != 0 ] ; then - >&2 echo "ERROR: The -m (membind) or -l (localalloc) require numactl to be installed." - execOnError "$@" - fi -else - _launch_process_cmd_binary=`which taskset` - if [ $? != 0 ] ; then - >&2 echo "ERROR: $0 requires the taskset command to be installed." - execOnError "$@" - fi -fi -if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then - >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored." 
- _use_numactl_membind=0 -fi - -_utilized_devices=$_ri_num_devices -[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks - -# Calculate number of GPUs to use to evenly spread ranks across GPUs. -# An rplace is a set of CUs that will be used for a rank. -# The number of rplaces must be at least the number of ranks. -_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices )) -_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices )) -if [ $_uncovered_ranks != 0 ] ; then - # If _num_local_ranks not divisible by number of GPUs, - # then add an extra rplace per GPU to make room for remainder. - _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 )) -fi -if [ $GPURUN_MASK_POLICY == "mutex" ] ; then - # For mutex policy, adjacent ranks are assigned to the same device. - _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU )) - # Some users want to avoid dev 0 etc, by setting GPURUN_DEVICE_BIAS - _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) -else - # for mask policies nomask or preset, adjacent ranks are assigned to - # different GPUs and oversubscribed ranks are assigned round robin - _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices )) -fi - -_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) -if [ $_num_local_ranks -gt $_node_cus ] ; then - >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " - execOnError "$@" -fi - -if [ $_uses_multi_device == 1 ]; then - # Enforce some rules on the use of -md option - # Note -md forces GPURUN_MASK_POLICY=nomask - if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then - >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode" - execOnError "$@" - fi - if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then - >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)" - execOnError "$@" - fi - _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset )) - if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then - printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices > than available devices ($_ri_num_devices)\n Some multi-device sets will overlap.\n" >&2 - fi - _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices)) - _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 )) - - # merge entries for this mdset from per device arrays - _md_bdfs="" - _md_cpus="" - _md_nns="" - _md_uuids="" - _md_dev_idxs="" - _sep="" - for i in `seq $_md_device_set_start $_md_device_set_end` ; do - _dev_index=$i - # handle index wrap around number of devices - [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices )) - _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]} - _new_nn=${_ss_numanode[$_dev_index]} - SAVEIFS=$IFS - IFS="," - _found=0 - for _existing_nn in $_md_nns ; do - [ $_existing_nn == $_new_nn ] && _found=1 - done - IFS=$SAVEIFS - if [ $_found == 0 ] ; then - # only add new numa node and cpulist, if not already in the md set - _md_nns+=$_sep$_new_nn - _md_cpus+=$_sep${_ss_cpulist[$_dev_index]} - fi - _md_uuids+=$_sep${_ss_uuid[$_dev_index]} - _md_dev_idxs+=$_sep$_dev_index - _sep="," - done - _device_num=$_md_device_set_start -fi - -_available_CUs_per_device=${_ss_cucount[$_device_num]} -_gfxid=${_ss_gfxid[$_device_num]} - -_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} )) -if [ $_num_local_ranks -gt 
$_node_cus ] ; then - >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks " - execOnError "$@" -fi - -_utilized_CUs_per_device=$_available_CUs_per_device -_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) -# Lower utilized CUs till divisible by number of rplaces per GPU -while [ $_rem2 != 0 ] ; do - _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 )) - _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU )) -done -_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU )) - -# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0 -if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then - if [ $_uses_multi_device == 0 ] ; then - _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device )) - _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices )) - _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks )) - _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) - _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace )) - _utilization=$(( ( $_used_cus * 100 ) / $_node_cus )) - if ! [ $_ri_num_devices -gt $_num_local_ranks ] ; then - if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then - _extra_diags=true - fi - fi - >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" - >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" - >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" - [ $_extra_diags ] && \ - >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU )) " - [ $_extra_diags ] && echo - >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces" - >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU" - [ $_extra_diags ] && \ - >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \ - >&2 echo "- gfxids ${_ss_gfxid[@]}" - >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}" - [ $_extra_diags ] && \ - >&2 echo "-- USED on CUs RANK0: $_utilized_CUs_per_device" - [ $_extra_diags ] && \ - >&2 echo "-- UNUSED CUs RANK0 : $_wasted_CUs_on_each_GPU" - >&2 echo "- CUs per RPLACE RANK0:$_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)" - >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU" - if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then - >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES" - fi - if [[ ! -z "$HSA_CU_MASK" ]] ; then - # node utilizatino could be incorrect with preset cumask. 
- >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK" - else - >&2 echo "- NODE UTILIZATION: $_utilization %" - fi - else - >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY" - >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)" - >&2 echo "- AVAILABLE GPUS: $_ri_num_devices" - >&2 echo "- DEVS PER RANK: $_devices_per_mdset" - >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)" - _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices )) - >&2 echo "- NODE UTILIZATION: $_md_utilization %" - fi -fi -# --- END OF DIAGNOSTIC BLOCK - -if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then - # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace - _bits_to_set=$_CUs_per_rplace - # This formula keeps adjacent ranks on same GPU which should be preferred - _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set) - ( _device_num * $_utilized_CUs_per_device) )) - # use bc because these values can be very large - _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc` - _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc` - # Calculate the number of leading zeros needed for this mask - _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 )) - for i in `seq 1 $_lz` ; do - _mask="0$_mask" - done - _mask="0x$_mask" -fi - -_launch_process_cmd="" -if [ $_uses_multi_device == 0 ] ; then - # retrieve scanned info from per device arrays - _bdfidstrc=${_ss_bdfid[$_device_num]} - NUMANODE=${_ss_numanode[$_device_num]} - _list_of_cpu_cores=${_ss_cpulist[$_device_num]} - _this_uuid=${_ss_uuid[$_device_num]} -else - # Use multi-device values - _bdfidstrc=$_md_bdfs - NUMANODE=$_md_nns - _list_of_cpu_cores=$_md_cpus - _this_uuid=$_md_uuids - _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset " -fi -if [ "$_use_numactl_localalloc" == "1" ] ; then - _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE" -elif [ "$_use_numactl_membind" == "1" ] ; then - _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE" -else - _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores" -fi - -# If gpurun was not given command to execute, then dont run _launch_process_cmd -[ "$*" == "" ] && _launch_process_cmd="" - -# only set ROCR_VISIBLE_DEVICES if not already set -if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then - export ROCR_VISIBLE_DEVICES=$_this_uuid - _log_word="RVD" -else - _log_word="PRESET-RVD" -fi - -export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace - -# - Limit HSA queues when multiple ranks per GPU -if [ $_number_of_rplaces_per_GPU != 1 ] ; then - # Only set these env controls if not set by caller - [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1 - [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1 -fi - -[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \ - [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK" - -if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then - # --- HSA_CU_MASK is NOT USED in this code block, This code block covers all multi-device execution. 
- if [ "$GPURUN_VERBOSE" != "0" ] ; then - if [ $_uses_multi_device == 1 ] ; then - printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2 - else - printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2 - fi - fi - $_launch_process_cmd $* - # --- end code block -else - # --- HSA_CU_MASK is required in this code block, assumes no multi-device - if [[ -z "$HSA_CU_MASK" ]] ; then - # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0: - export HSA_CU_MASK=0:$_mask - else - # use preset mask - _mask=$HSA_CU_MASK - fi - if [ "$GPURUN_VERBOSE" != "0" ] ; then - printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2 - fi - HSA_CU_MASK=0:$_mask \ - $_launch_process_cmd $* - # --- end code block -fi -exit $? +# Future: +# This utility also sets environment variable HSA_CU_MASK +# to control which CUs are available to the process. +# HSA_CU_MASK is set only when more than one OpenMPI process +# (rank) will utilize the same GPU and it is not preset. +# Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the +# number of CUs available to the process after masking. +# +# $ gpurun -topo +# Topology Numa: 0 PageSize: [always] madvise never +# +# GPU Node Affinity UUID Cores +# 0 0 0 GPU-b256278bf70405e2 0-23,96-119 +# 1 1 1 GPU-a33557394e2c744e 24-47,120-143 +# 2 2 2 GPU-4f78640baf57e5f0 48-71,144-167 +# 3 3 3 GPU-b66921701d196e10 72-95,168-191 + +import subprocess +import re +import os +import sys + +if sys.version_info < (3, 7): + print("require minimum python version 3.7 or later") + sys.exit(0) + +noAmdSmi = False + +try: + from amdsmi import * +except ImportError: + noAmdSmi = True + + +def get_amd_smi_static_numa(): + """ + get the output of 'amd-smi static --numa' to extract GPU affinity + and NUMA node information, storing them in arrays indexed by GPU number. 
+    """
+    gpu_affinity = []
+    numa_node = []
+    hip_uuid = []
+    gpu_id = 0
+
+    amdsmi_init()
+
+    try:
+        devices = amdsmi_get_processor_handles()
+        node_number = 0
+        affi_node = 0
+        if len(devices) == 0:
+            print("No GPUs on machine")
+            sys.exit(1)
+        for device in devices:
+            info = amdsmi_get_gpu_enumeration_info(device)
+            node_number = amdsmi_topo_get_numa_node_number(device)
+            if debug_numa > 2: print("****");print("gpu_id: ", gpu_id);print("Numa: ",node_number)
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_node) <= gpu_id:
+                numa_node.append(None)
+            numa_node[gpu_id] = node_number
+            while len(hip_uuid) <= gpu_id:
+                hip_uuid.append(None)
+            hip_uuid[gpu_id] = info['hip_uuid']
+            if debug_numa > 2: print("hip_id: ", info['hip_id']); print("hip_uuid: ", info['hip_uuid'])
+
+            try:
+                affi_node = amdsmi_get_gpu_topo_numa_affinity(device)
+                if affi_node == -1: affi_node = node_number
+                if debug_numa > 2: print("Affinity: ", affi_node)
+            except AmdSmiException as e:
+                if debug_numa > 2: print("N/A")
+
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+            gpu_affinity[gpu_id] = affi_node
+            gpu_id += 1
+    except AmdSmiException as e:
+        print(f"Error executing amd-smi: {e}")
+
+    return gpu_affinity, numa_node, hip_uuid
+
+def parse_rocm_smi_toponuma():
+    """
+    Parses the output of 'rocm-smi --showtoponuma' to extract GPU affinity
+    and NUMA node information, storing them in arrays indexed by GPU number.
+    """
+    try:
+        # Execute the rocm-smi command
+        UIresult = subprocess.run(['rocm-smi', '--showuniqueid'], capture_output=True, text=True, check=True)
+        UIoutput = UIresult.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing rocm-smi: {e}")
+        return None, None, None
+    except FileNotFoundError:
+        print("Error: 'rocm-smi' command not found. Ensure ROCm is installed and in your PATH.")
+        return None, None, None
+
+    hip_uuid = []
+    patternUI = re.compile(r"GPU\[(\d+)\]\s+:\s+Unique\s+ID:\s+0x([0-9a-fA-F]+)")
+    for line in UIoutput.splitlines():
+        match = patternUI.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            UUID = match.group(2)
+            while len(hip_uuid) <= gpu_id:
+                hip_uuid.append(None)
+            hip_uuid[gpu_id] = "GPU-"+UUID
+
+    try:
+        # Execute the rocm-smi command
+        result = subprocess.run(['rocm-smi', '--showtoponuma'], capture_output=True, text=True, check=True)
+        output = result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing rocm-smi: {e}")
+        return None, None, None
+    except FileNotFoundError:
+        print("Error: 'rocm-smi' command not found. 
Ensure ROCm is installed and in your PATH.")
+        return None, None, None
+
+    gpu_affinity = []
+    numa_node = []
+
+    # Regex to find lines containing GPU information (e.g., "GPU[0-9]: Affinity: [0-9]+, Node: [0-9]+")
+    patternAffy = re.compile(r"GPU\[(\d+)\]\s+:\s+\(Topology\) Numa Affinity:\s+(\d+)")
+    patternErrA = re.compile(r"get_numa_affinity_topology, Not supported on the given system")
+    patternNode = re.compile(r"GPU\[(\d+)\]\s+:\s+\(Topology\) Numa Node:\s+(\d+)")
+
+    for line in output.splitlines():
+        match = patternAffy.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            affinity = int(match.group(2))
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+
+            gpu_affinity[gpu_id] = affinity
+        match = patternNode.search(line)
+        if match:
+            gpu_id = int(match.group(1))
+            node = int(match.group(2))
+
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_node) <= gpu_id:
+                numa_node.append(None)
+
+            numa_node[gpu_id] = node
+        #cpx tpx etc are missing affinity info, fix it here
+        match = patternErrA.search(line)
+        if match:
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(gpu_affinity) <= gpu_id:
+                gpu_affinity.append(None)
+
+            #use previous gpu_affinity
+            gpu_affinity[gpu_id] = affinity
+    return gpu_affinity, numa_node, hip_uuid
+
+
+def parse_lscpu_numa():
+# get lscpu numa info
+# NUMA node0 CPU(s): 0-7
+# NUMA node1 CPU(s): 8-15
+    try:
+        # Execute the lscpu command
+        result = subprocess.run(['lscpu'], capture_output=True, text=True, check=True)
+        output = result.stdout
+    except subprocess.CalledProcessError as e:
+        print(f"Error executing lscpu: {e}")
+        return None
+    except FileNotFoundError:
+        print("Error: 'lscpu' command not found.")
+        return None
+
+    numa_cpus = []
+    patternLSCPU = re.compile(r"NUMA node(\d+) CPU\(s\):\s+([\d,-]+)")
+
+    if debug_numa > 2:print("NUMA CPUs:")
+    for line in output.splitlines():
+        match = patternLSCPU.search(line)
+        if match:
+            numa_id = int(match.group(1))
+            cpus = match.group(2)
+            if debug_numa > 2:print(" numa cores:", numa_id, cpus)
+            # Ensure lists are large enough to accommodate the GPU ID
+            while len(numa_cpus) <= numa_id:
+                numa_cpus.append(None)
+            numa_cpus[numa_id] = cpus
+    return numa_cpus
+
+def check_numactl_exists():
+    try:
+        subprocess.run(['numactl', '--version'], check=True,
+                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return True
+    # numactl command not found in PATH
+    except FileNotFoundError: return False
+    except subprocess.CalledProcessError: return True
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False
+
+def check_taskset_exists():
+    try:
+        subprocess.run(['taskset', '--version'], check=True,
+                       stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        return True
+    # taskset command not found in PATH
+    except FileNotFoundError: return False
+    except subprocess.CalledProcessError: return True
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
+        return False
+
+def helpExit(exCode):
+    if exCode == 1: print("Error: nothing to bind")
+    print("Usage: gpurun [gpurun_options] Program and options")
+    print(" -h --help : display help text")
+    print(" -v : display gpurun command")
+    print(" -vv : display additional debug info")
+    print(" -vvv : display more debug info")
+    print(" -dryrun : do not run bindings")
+    print(" -taskset : use taskset for binding")
+    print(" -numactl : use numactl for binding [default]")
+    print(" -md : Set number of 
desired devices for multi-device mode, default=1")
+    print(" -nr : use numactl ROCR_VISIBLE_DEVICES")
+    print(" -nm : use numactl OMPI_COMM_WORLD_LOCAL_RANK")
+    print(" -topo : display the topology and exit")
+    print(" -rocmsmi : force use of rocm-smi rather than amd-smi")
+    print(" --version : Print version of gpurun and exit")
+    print("")
+    sys.exit(exCode)
+    # still to do
+    # -m use numactl membind to CPUs in same NUMA domain. Note: Allocation
+    # fails when not enough memory available on these nodes.
+    # -l use numactl localalloc to CPUs in same NUMA domain. Note: If
+    # memory cannot be allocated, alloc falls back to other nodes.
+    # support GPU-xxxxxxxx
+
+def processArgs():
+    sysPos=1
+    debug_numa=0
+    use_taskset=False
+    use_numactl=True
+    use_nobind=False
+    use_nr=False
+    use_nm=False
+    use_md=False
+    use_rocmsmi=False
+    md_count=1
+
+    dump_topo=False
+    dry_run=False
+    skip_args = ["-s", "-q", "-m", "-l" ]
+    # loop over bind arguments
+    while True:
+        if len(sys.argv[sysPos:]) == 0: helpExit(1)
+        if sys.argv[sysPos] == "-v": debug_numa=1
+        elif sys.argv[sysPos] == "-vv": debug_numa=2
+        elif sys.argv[sysPos] == "-vvv": debug_numa=3
+        elif sys.argv[sysPos] in ["-h", "--help"]: helpExit(0)
+        elif sys.argv[sysPos] == "--version": print("Version: 22.0.0"); sys.exit(0)
+        elif sys.argv[sysPos] == "-dryrun": dry_run=True
+        elif sys.argv[sysPos] == "-taskset": use_taskset=True; use_numactl=False; use_nobind=False
+        elif sys.argv[sysPos] == "-numactl": use_numactl=True; use_taskset=False; use_nobind=False
+        elif sys.argv[sysPos] == "-nobind": use_nobind=True; use_taskset=False; use_numactl=False
+        elif sys.argv[sysPos] == "-topo": dump_topo=True; break
+        elif sys.argv[sysPos] == "-nr": use_nr=True
+        elif sys.argv[sysPos] == "-nm": use_nm=True
+        elif sys.argv[sysPos] == "-rocmsmi": use_rocmsmi=True
+        elif sys.argv[sysPos] == "-md":
+            use_md=True
+            if sys.argv[sysPos+1].isdigit():
+                md_count=int(sys.argv[sysPos+1])
+                sysPos += 1
+        # to be implemented GPURUN options
+        elif sys.argv[sysPos] in skip_args: skipped_args=True
+        else: break
+        sysPos += 1
+
+    return sysPos, debug_numa, use_taskset, use_numactl, use_nobind, dry_run, use_md, md_count, use_nr, use_nm, dump_topo, use_rocmsmi
+
+def dumpTopology(affinity_data, node_data, hip_uuid, numa_cpus):
+    numaStat=""
+    pageSize=""
+    with open('/proc/sys/kernel/numa_balancing', 'r') as f: numaStat = f.read()
+    with open('/sys/kernel/mm/transparent_hugepage/enabled', 'r') as f: pageSize = f.read()
+    Tb="\t"
+    print("Topology Numa: "+numaStat.strip()+" PageSize: "+pageSize.strip()+"\n\nGPU Node Affinity UUID Cores")
+    for i in range(len(node_data)):
+        print(i, Tb, node_data[i], Tb, affinity_data[i], Tb, hip_uuid[i], Tb, numa_cpus[affinity_data[i]])
+    sys.exit(0)
+
+if __name__ == "__main__":
+    sysPos, debug_numa, use_taskset, use_numactl, use_nobind, dry_run, use_md, md_count, use_nr, use_nm, dump_topo, use_rocmsmi = processArgs()
+    #check for numactl and taskset
+    has_numactl = check_numactl_exists()
+    has_taskset = check_taskset_exists()
+
+    #get topo info
+    if use_taskset or dump_topo: numa_cpus = parse_lscpu_numa()
+    if noAmdSmi or use_rocmsmi:
+        affinity_data, node_data, hip_uuid = parse_rocm_smi_toponuma()
+    else:
+        affinity_data, node_data, hip_uuid = get_amd_smi_static_numa()
+
+    if debug_numa > 1: print(affinity_data, node_data, hip_uuid)
+    if dump_topo: dumpTopology(affinity_data, node_data, hip_uuid, numa_cpus)
+
+    numGpus = len(node_data)
+    rocrVisDev = int(os.environ.get('ROCR_VISIBLE_DEVICES', '-1'))
+    localRank = 
int(os.environ.get('OMPI_COMM_WORLD_LOCAL_RANK', '0')) + numRanksLocal = int(os.environ.get('OMPI_COMM_WORLD_LOCAL_SIZE', '1')) + + # support override by envvar + gpurun_bypass = int(os.environ.get('GPURUN_BYPASS', '0')) + if gpurun_bypass: + use_taskset = False + use_numactl = False + use_nobind = True + + if rocrVisDev != -1 or use_nr: + adjRank = rocrVisDev + elif use_nm or use_numactl: + adjRank = localRank % numGpus + else: + adjRank=0 + if debug_numa > 1: + print("#GPUs ", numGpus, "numRanks", numRanksLocal, "localRank", localRank, "adjRank", adjRank, "RVD", rocrVisDev) + if debug_numa > 2: + if affinity_data is not None and node_data is not None: + print("\nGPU Affinity:") + for i, affinity in enumerate(affinity_data): + if affinity is not None: + print(f" GPU {i}: Affinity = {affinity}") + + print("\n GPU NUMA Nodes:") + for i, node in enumerate(node_data): + if node is not None: + print(f" GPU {i}: NUMA Node = {node}") + + my_env = os.environ.copy() + if use_md: + my_env["ROCR_VISIBLE_DEVICES"] = "0,1" + else: + my_env["ROCR_VISIBLE_DEVICES"] = str(adjRank) + if use_taskset and has_taskset: + program_to_run = [ "taskset", "-c", numa_cpus[node_data[adjRank]]] + elif use_numactl and has_numactl: + program_to_run = [ "numactl", "--cpunodebind", str(node_data[adjRank]), "--membind", str(affinity_data[adjRank]) ] + elif use_nobind: + program_to_run = [ ] + else: + program_to_run = [ ] + program_to_run.extend(sys.argv[sysPos:]) + if debug_numa > 0 or dry_run: print("ROCR_VISIBLE_DEVICES", my_env["ROCR_VISIBLE_DEVICES"], " ", program_to_run) + if not dry_run: result = subprocess.run(program_to_run, env=my_env, capture_output=False, text=False, check=False) diff --git a/offload/utils/gpurun/gpurun-old b/offload/utils/gpurun/gpurun-old new file mode 100755 index 0000000000000..870bc7a8ccbcd --- /dev/null +++ b/offload/utils/gpurun/gpurun-old @@ -0,0 +1,697 @@ +#!/bin/bash +# Copyright(C) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +# of the Software, and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# gpurun: Process launch utility for GPU applications. This is a wrapper +# to execute application binaries including OpenMPI GPU applications. +# See help message below (gpurun -h) for more information. +# +# Usage Examples: +# gpurun true +# mpirun -np 4 gpurun env | grep ROCR_VISIBLE_DEVICES +# + +# If set to 1, just invoke the rest of the command line without doing anything +# else. 
+GPURUN_BYPASS=${GPURUN_BYPASS:-0} + +function execOnError() { + exec "$@" +} + +# PROGVERSION string is updated by cmake when component is installed +PROGVERSION=X.Y-Z +function version(){ + echo $0 version $PROGVERSION + exit 0 +} +function usage(){ +/bin/cat 2>&1 <<"EOF" + + gpurun: Application process launch utility for GPUs. + This utility ensures the process will enable either a single + GPU or the number specified with -md (multi-device) option. + It launches the application binary with either the 'taskset' + or 'numactl' utility so the process only runs on CPU cores + in the same NUMA domain as the selected GPUs. + This utility sets environment variable ROCR_VISIBLE_DEVICES + to selected GPUs ONLY if it was not already set by the + callers environment AND the number of GPUs is not 1. + This utility also sets environment variable HSA_CU_MASK + to control which CUs are available to the process. + HSA_CU_MASK is set only when more than one OpenMPI process + (rank) will utilize the same GPU and it is not preset. + Lastly, it sets env variable OMPX_TARGET_TEAM_PROCS to the + number of CUs available to the process after masking. + + Usage: + gpurun [ ] + mpirun -np gpurun [ ] + + Options: + -h Print this help message and exit + -md Set number of desired devices for multi-device mode, default=1 + -s suppress output, often useful in benchmarking + -q suppress output, quiet, alias of -s, same as GPURUN_VERBOSE=0 + -v Verbose output, same as GPURUN_VERBOSE=1 + -vv Verbose output, same as GPURUN_VERBOSE=2 + -m use numactl membind to CPUs in same NUMA domain. Note: Allocation + fails when not enough memory available on these nodes. + -l use numactl localalloc to CPUs in same NUMA domain. Note: If + memory cannot be allocated, alloc falls back to other nodes. + -nr use numactl ROCR_VISIBLE_DEVICES + -nm use numactl OMPI_COMM_WORLD_LOCAL_RANK + --version Print version of gpurun and exit + + Optional Input environment variables: + GPURUN_VERBOSE + 0: default for silent operation, no trace printed to stderr + 1: -v prints trace record including process launch cmd to stderr + 2: -vv prints trace and other summary diagnostics + ROCMINFO_BINARY Set location of rocminfo binary + AOMP: location of AOMP or ROCM + GPURUN_DEVICE_BIAS: amount to shift device number to avoid dev 0. + This only works for single device mode. + GPURUN_VISIBLE_DEVICE_TYPES: useful if machine has different GPU cards + GPURUN_MASK_POLICY : useful if machine has different GPU cards + ROCR_VISIBLE_DEVICES: See description above + OMPI_COMM_WORLD_LOCAL_SIZE Number of ranks on this node set by openmpi + OMPI_COMM_WORLD_LOCAL_RANK The local rank number 0-(nranks-1) from openmpi + This also checks for MPI_LOCALNRANKS/MPI_LOCALRANKID + and MPI_COMM_WORLD_LOCAL_SIZE/MPI_COMM_WORLD_LOCAL_RANK + + Generated (output) Environment Variables: + OMPX_TARGET_TEAM_PROCS - Number of CUs available to process + ROCR_VISIBLE_DEVICES - list of GPU Uuids for the selected devices if not preset + HSA_CU_MASK - The CU mask for the device. + LIBOMPTARGET_NUM_MULTI_DEVICES - the value set by -md argument + GPU_MAX_HW_QUEUES + LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" + + Limitations: + - Currently, gpurun creates masks that are mutually exclusive of each other. + That is, the MPI processes will not share CUs. If number of ranks is not + perfectly divisible by number of CUs or number of GPUs, some resources + would be unused. + Set GPURUN_VERBOSE=1 or 2 to see overall cu utilization. 
+ - Works with AOMP 19.0-0 or ROCM 6.1 or greater + - cu masking is not available when multiple devices per process are enabled + with -md option (multi-device) mode. + + Notes: + With MPI, this utility distributes GPUs and their CUs across + multiple ranks of an MPI job into mutually exclusive sets of CUs. + It uses OpenMPI environment variables OMPI_COMM_WORLD_LOCAL_SIZE + and OMPI_COMM_WORLD_LOCAL_RANK to set visible devices and a + the mutually exclusive CU mask. + + An rplace (rank place) is a subset of CUs for a rank. + gpurun calculates the number of rplaces needed to contain all + the specified number of ranks for this node. If number of ranks not + divisible by number of GPUs, then there will be more rplaces than ranks. + The number of CUs in an rplace is calculated by dividing the number of + CUs per GPU by the number of rplaces per GPU. This is also the number of + bits set in the CU mask. This is also the number of physical locations + available for an OpenMP team to execute. This utility exports that number + to the environment variable OMPX_TARGET_TEAM_PROCS. This value + could be used by the application or runtume to adjust the number + of desired teams in a target region. If no masking occurs, the entire + GPU is available for the process and OMPX_TARGET_TEAM_PROCS is set to + the total number of CUs on the GPU. + + Copyright (c) 2024 ADVANCED MICRO DEVICES, INC. + +EOF + exit 0 +} + +_end_gpurun_opts=0 +_devices_per_mdset=1 +_uses_multi_device=0 +while [ "$_end_gpurun_opts" == "0" ] ; do + case "$1" in + -s) GPURUN_VERBOSE=0;; + -q) GPURUN_VERBOSE=0;; + --quiet) GPURUN_VERBOSE=0;; + -h) usage ;; + -help) usage ;; + --help) usage ;; + -version) version ;; + --version) version ;; + -v) GPURUN_VERBOSE=1;; + -vv) GPURUN_VERBOSE=2;; + -m) _use_numactl_membind=1;; + -md) shift; _devices_per_mdset=$1; _uses_multi_device=1;; + -nr) _use_numactl_rocr=1;; + -nm) _use_numactl_ompi=1;; + -l) _use_numactl_localalloc=1;; + -nomask) GPURUN_MASK_POLICY="nomask";; + *) _end_gpurun_opts=1; break;; + esac + if [ "$_end_gpurun_opts" == "0" ] ; then + shift + fi +done + +if [ "$GPURUN_BYPASS" = "1" ]; then + execOnError "$@" +fi + +# Default: quiet operation +GPURUN_VERBOSE=${GPURUN_VERBOSE:-0} +# Default: create mutually exclusive sets of CUs when GPU is oversubscribed +GPURUN_MASK_POLICY=${GPURUN_MASK_POLICY:-mutex} +# switch mask policy to preset if HSA_CU_MASK was preset +[[ ! -z "$HSA_CU_MASK" ]] && GPURUN_MASK_POLICY=preset +# switch mask policy to nomask for multi-device +[[ $_uses_multi_device == 1 ]] && GPURUN_MASK_POLICY=nomask +# Offset selected device to avoid some heavily used GPUs +GPURUN_DEVICE_BIAS=${GPURUN_DEVICE_BIAS:-0} + +# Get environment variables set by OpenMPI +_num_local_ranks=$OMPI_COMM_WORLD_LOCAL_SIZE +_local_rank_num=$OMPI_COMM_WORLD_LOCAL_RANK +# If not OpenMPI, check for Platform MPI, MVAPICH +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_LOCALNRANKS + _local_rank_num=$MPI_LOCALRANKID +fi +# Also try MPI_COMM_WORLD env vars +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=$MPI_COMM_WORLD_LOCAL_SIZE + _local_rank_num=$MPI_COMM_WORLD_LOCAL_RANK +fi +# Check if SLURM was used +if [ -z "$_num_local_ranks" ] && [ ! -z $SLURM_CPUS_ON_NODE ] ; then + _num_local_ranks=$SLURM_CPUS_ON_NODE + _local_rank_num=$SLURM_LOCALID +fi + +if [ "$_use_numactl_rocr" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $ROCR_VISIBLE_DEVICES --membind $ROCR_VISIBLE_DEVICES $* + exit $? + else + $* + exit $? 
+ fi +fi +if [ "$_use_numactl_ompi" == "1" ] ; then + _cmd_binary=`which numactl` + if [ $? == 0 ] ; then + numactl --cpunodebind $OMPI_COMM_WORLD_LOCAL_RANK --membind $OMPI_COMM_WORLD_LOCAL_RANK $* + exit $? + else + $* + exit $? + fi +fi +# If none of the above MPIs, assume gpurun is wrapper for single process on single GPU +if [ -z "$_num_local_ranks" ] ; then + _num_local_ranks=1 + _local_rank_num=0 +fi + +# Find location of the rocminfo binary +AOMP=${AOMP:-_AOMP_INSTALL_DIR_} +if [ ! -d $AOMP ] ; then + AOMP="_AOMP_INSTALL_DIR_" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/lib/llvm" +fi +if [ ! -d $AOMP ] ; then + AOMP="/opt/rocm/llvm" +fi +if [ ! -d $AOMP ] ; then + realpath=`realpath $0` + thisdir=`dirname $realpath` + AOMP=$thisdir/.. +fi +if [ ! -d $AOMP ] ; then + >&2 echo "ERROR: AOMP not found at $AOMP" + >&2 echo " Please install AOMP or correctly set env-var AOMP" + execOnError "$@" +fi +ROCMINFO_BINARY=${ROCMINFO_BINARY:-$AOMP/bin/rocminfo} +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../bin/rocminfo +[ ! -f $ROCMINFO_BINARY ] && ROCMINFO_BINARY=$AOMP/../../bin/rocminfo +if [ ! -f $ROCMINFO_BINARY ] ; then + >&2 echo "ERROR: Could not find binary for rocminfo," + >&2 echo " Please correct installation of ROCM or AOMP compiler" + execOnError "$@" +fi + +# Use rocminfo to find number number of CUs and gfxids for each GPU. +_tfile="/tmp/rinfo_out$$" +$ROCMINFO_BINARY 2>/dev/null | grep -E " Name:| Compute Unit:| Device Type:| BDFID:| Uuid:" |grep -v generic >$_tfile +_tfile_lines=`wc -l $_tfile | cut -d" " -f1` +if [ $_tfile_lines == 0 ] ; then + >&2 echo "ERROR: $ROCMINFO_BINARY failed to find GPU devices" + rm $_tfile + execOnError "$@" +fi +# Create 3 _ri_ arrays by parsing rocminfo (ri), one array entry per device +_ri_all_gfxids="" +_ri_gfxids=() +_ri_cucount=() +_ri_bdfids=() +_ri_dev_idx=() +_ri_num_devices=0 +_last_cu_count=0 +_ri_uuid=() +_last_device_type_was_gpu=0 +_device_type_preset=0 +_ri_num_all_devices=0 +[ ! -z $GPURUN_VISIBLE_DEVICE_TYPES ] && _device_type_preset=1 +while read _linepair ; do + _fieldvalue=`echo $_linepair | cut -d":" -f2` + _fieldtype=`echo $_linepair | cut -d":" -f1` + if [ $_fieldvalue == "CPU" ] ; then + _last_device_type_was_gpu=0 + elif [ $_fieldvalue == "GPU" ] ; then + _last_device_type_was_gpu=1 + elif [ "$_fieldtype" == "Uuid" ] ; then + _this_uuid=$_fieldvalue + elif [ "$_fieldtype" == "BDFID" ] ; then + if [[ $_last_device_type_was_gpu == 1 ]] ; then + # _domain="$(echo "$_fieldvalue / (2^32)" | bc)" + _bus="$(echo "($_fieldvalue / (2^8)) % (2^8)" | bc)" + _devfn="$(echo "($_fieldvalue % (2^8))" | bc)" + _bdfidstr="$(printf "%.2x:%.2x" "$_bus" "$_devfn")" + fi + elif [ "$_fieldtype" == "Name" ] ; then + # The device name field is last in rocminfo output, so we can create new _ri_ array entry + if [[ $_last_device_type_was_gpu == 1 ]] ; then + _this_gfxid=`echo $_fieldvalue | cut -d'-' -f5` + ! 
+      [[ ${_ri_all_gfxids} != *"$_this_gfxid"* ]] && _ri_all_gfxids+=" $_this_gfxid"
+      _is_type_visible=1
+      if [ $_device_type_preset == 1 ] ; then
+        _is_type_visible=0
+        if [[ ${GPURUN_VISIBLE_DEVICE_TYPES} == *"$_this_gfxid"* ]] ; then
+          _is_type_visible=1
+        fi
+      fi
+      if [ $_is_type_visible == 1 ] ; then
+        _ri_gfxids+=( $_this_gfxid )
+        _ri_cucount+=( $_last_cu_count )
+        _ri_bdfids+=( $_bdfidstr )
+        _ri_dev_idx+=( $_ri_num_all_devices )
+        _ri_uuid+=( $_this_uuid )
+        _ri_num_devices=$(( $_ri_num_devices + 1 ))
+      fi
+      _ri_num_all_devices=$(( $_ri_num_all_devices + 1 ))
+    fi
+  else
+    # else the _fieldvalue was the number of CUs or GCPUs
+    if [[ $_last_device_type_was_gpu == 1 ]] ; then
+      _last_cu_count=$_fieldvalue
+    fi
+  fi
+done < $_tfile
+rm $_tfile
+
+if [ $_ri_num_devices == 0 ] ; then
+  if [ $_local_rank_num == 0 ] ; then
+    if [ $_device_type_preset == 1 ] ; then
+      >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY of type $GPURUN_VISIBLE_DEVICE_TYPES."
+      >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+    else
+      >&2 echo "ERROR: No amdgpu devices found by $ROCMINFO_BINARY"
+    fi
+    if [ ! -z $ROCR_VISIBLE_DEVICES ] ; then
+      >&2 echo "       ROCR_VISIBLE_DEVICES was preset to $ROCR_VISIBLE_DEVICES"
+      >&2 echo "       Consider unsetting ROCR_VISIBLE_DEVICES and letting gpurun set it correctly."
+    fi
+    execOnError "$@"
+  else
+    execOnError "$@"
+  fi
+fi
+
+# Scan /sys/bus/pci/devices (_ss_) for amdgpu devices and store info in 6 per
+# device arrays indexed by device num. The arrays are _ss_cpulist, _ss_bdfid,
+# _ss_numanode, _ss_uuid, _ss_gfxid, and _ss_cucount. Some information
+# (cucount, gfxid, dev_idx) must be copied from the _ri_ arrays built above
+# by scanning output from rocminfo.
+_sysdevdir="/sys/bus/pci/devices"
+_ss_num_devices=0
+_ss_cpulist=()
+_ss_bdfid=()
+_ss_numanode=()
+_ss_uuid=()
+_ss_gfxid=()
+_ss_cucount=()
+for _devid in `ls $_sysdevdir` ; do
+  if [ -f $_sysdevdir/$_devid/device ] ; then
+    _driver_name=`cat $_sysdevdir/$_devid/uevent | grep DRIVER | awk '{print $1}'`
+    if [ ! -z $_driver_name ] ; then
+      if [ $_driver_name == "DRIVER=amdgpu" ] ; then
+        _numa_node=`cat $_sysdevdir/$_devid/numa_node`
+        [ "$_numa_node" == "-1" ] && _numa_node=0
+        _this_uuid=0
+        if [ -f $_sysdevdir/$_devid/unique_id ] ; then
+          _this_uuid=`cat $_sysdevdir/$_devid/unique_id`
+          if [ -z $_this_uuid ] ; then
+            _this_uuid=0
+            _has_unique_id_file=0
+          else
+            _this_uuid="GPU-$_this_uuid"
+            _has_unique_id_file=1
+          fi
+        fi
+        _this_cpulist=`cat $_sysdevdir/$_devid/local_cpulist`
+        _match_uuid_count=0
+        for _ri_i in ${!_ri_bdfids[@]} ; do
+          _ss_value=$_this_uuid
+          _ri_value=${_ri_uuid[$_ri_i]}
+          if [ $_ss_value == $_ri_value ] ; then
+            _match_uuid_count=$(( $_match_uuid_count + 1 ))
+          fi
+        done
+        # Search _ri_ arrays for matching uuids or matching bdfids.
+        for _ri_i in ${!_ri_bdfids[@]} ; do
+          if [ "$_has_unique_id_file" == "1" ] ; then
+            _ss_value=$_this_uuid
+            _ri_value=${_ri_uuid[$_ri_i]}
+          elif [ "${_ri_bdfids[$_ri_i]}" == "00:00" ]; then
+            # Under Hyper-V, we may see a zero BDFID. Fall back to UUID.
+            _ss_value=$_devid
+            _ri_value=$_devid
+          else
+            _ss_value=$_devid
+            _ri_value="0000:${_ri_bdfids[$_ri_i]}.0"
+          fi
+          if [ $_ss_value == $_ri_value ] ; then
+            if [ $_this_uuid == 0 ] || [ $_match_uuid_count -gt 1 ] ; then
+              # Some GPUs do not have unique_id or TPX mode creates multiple
+              # identical uuids, so use device index for RVD
+              _ss_uuid+=( ${_ri_dev_idx[$_ri_i]} )
+            else
+              _ss_uuid+=( $_this_uuid )
+            fi
+            _ss_gfxid+=( ${_ri_gfxids[$_ri_i]} )
+            _ss_cucount+=( ${_ri_cucount[$_ri_i]} )
+            _ss_bdfid+=( $_devid )
+            _ss_numanode+=( $_numa_node )
+            _ss_cpulist+=( $_this_cpulist )
+            _ss_num_devices=$(( $_ss_num_devices + 1 ))
+          fi
+        done
+      fi
+    fi
+  fi
+done
+
+if [[ $_ss_num_devices -lt 1 ]] ; then
+  if [ $_device_type_preset == 1 ] ; then
+    >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir of type $GPURUN_VISIBLE_DEVICE_TYPES."
+    >&2 echo "       Set GPURUN_VISIBLE_DEVICE_TYPES to one of these types: ${_ri_all_gfxids}"
+  else
+    >&2 echo "ERROR: No amdgpu devices found in $_sysdevdir."
+  fi
+  execOnError "$@"
+fi
+
+# check for taskset or numactl cmd
+if [ "$_use_numactl_membind" == "1" ] || [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_process_cmd_binary=`which numactl`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: The -m (membind) and -l (localalloc) options require numactl to be installed."
+    execOnError "$@"
+  fi
+else
+  _launch_process_cmd_binary=`which taskset`
+  if [ $? != 0 ] ; then
+    >&2 echo "ERROR: $0 requires the taskset command to be installed."
+    execOnError "$@"
+  fi
+fi
+if [ "$_use_numactl_membind" == "1" ] && [ "$_use_numactl_localalloc" == "1" ] ; then
+  >&2 echo "GPURUN WARNING: When -l and -m are both set, -m is ignored."
+  _use_numactl_membind=0
+fi
+
+_utilized_devices=$_ri_num_devices
+[ $_ri_num_devices -gt $_num_local_ranks ] && _utilized_devices=$_num_local_ranks
+
+# Calculate number of GPUs to use to evenly spread ranks across GPUs.
+# An rplace is a set of CUs that will be used for a rank.
+# The number of rplaces must be at least the number of ranks.
+_uncovered_ranks=$(( $_num_local_ranks % $_utilized_devices ))
+_number_of_rplaces_per_GPU=$(( $_num_local_ranks / $_utilized_devices ))
+if [ $_uncovered_ranks != 0 ] ; then
+  # If _num_local_ranks not divisible by number of GPUs,
+  # then add an extra rplace per GPU to make room for remainder.
+  _number_of_rplaces_per_GPU=$(( $_number_of_rplaces_per_GPU + 1 ))
+fi
+if [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+  # For mutex policy, adjacent ranks are assigned to the same device.
+  _rplace_num=$(( $_local_rank_num / $_number_of_rplaces_per_GPU ))
+  # Some users want to avoid dev 0, etc., by setting GPURUN_DEVICE_BIAS
+  _device_num=$(( ( $_rplace_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+else
+  # For mask policies nomask or preset, adjacent ranks are assigned to
+  # different GPUs and oversubscribed ranks are assigned round robin.
+  _device_num=$(( ( $_local_rank_num + $GPURUN_DEVICE_BIAS ) % $_ri_num_devices ))
+fi
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+  >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks"
+  execOnError "$@"
+fi
+
+if [ $_uses_multi_device == 1 ]; then
+  # Enforce some rules on the use of -md option
+  # Note -md forces GPURUN_MASK_POLICY=nomask
+  if [[ ! -z $ROCR_VISIBLE_DEVICES ]] ; then
+    >&2 echo "ERROR: DO NOT PRESET ROCR_VISIBLE_DEVICES in gpurun multi-device (-md) mode"
+    execOnError "$@"
+  fi
+  if [ $_devices_per_mdset -gt $_ri_num_devices ] ; then
+    >&2 echo "ERROR: More devices requested ($_devices_per_mdset) than available ($_ri_num_devices)"
+    execOnError "$@"
+  fi
+  _md_total_devices=$(( $_num_local_ranks * $_devices_per_mdset ))
+  if [ $_md_total_devices -gt $_ri_num_devices ] && [ $_local_rank_num == 0 ] ; then
+    printf "WARNING: processes($_num_local_ranks) * md set size($_devices_per_mdset) = $_md_total_devices exceeds available devices ($_ri_num_devices)\n         Some multi-device sets will overlap.\n" >&2
+  fi
+  _md_device_set_start=$(( ( $_local_rank_num * $_devices_per_mdset ) % $_ri_num_devices ))
+  _md_device_set_end=$(( $_md_device_set_start + $_devices_per_mdset - 1 ))
+
+  # merge entries for this mdset from per device arrays
+  _md_bdfs=""
+  _md_cpus=""
+  _md_nns=""
+  _md_uuids=""
+  _md_dev_idxs=""
+  _sep=""
+  for i in `seq $_md_device_set_start $_md_device_set_end` ; do
+    _dev_index=$i
+    # handle index wrap around number of devices
+    [ $i -ge $_ri_num_devices ] && _dev_index=$(( $i % $_ri_num_devices ))
+    _md_bdfs+=$_sep${_ss_bdfid[$_dev_index]}
+    _new_nn=${_ss_numanode[$_dev_index]}
+    SAVEIFS=$IFS
+    IFS=","
+    _found=0
+    for _existing_nn in $_md_nns ; do
+      [ $_existing_nn == $_new_nn ] && _found=1
+    done
+    IFS=$SAVEIFS
+    if [ $_found == 0 ] ; then
+      # only add new numa node and cpulist, if not already in the md set
+      _md_nns+=$_sep$_new_nn
+      _md_cpus+=$_sep${_ss_cpulist[$_dev_index]}
+    fi
+    _md_uuids+=$_sep${_ss_uuid[$_dev_index]}
+    _md_dev_idxs+=$_sep$_dev_index
+    _sep=","
+  done
+  _device_num=$_md_device_set_start
+fi
+
+_available_CUs_per_device=${_ss_cucount[$_device_num]}
+_gfxid=${_ss_gfxid[$_device_num]}
+
+_node_cus=$(( $_ri_num_devices * ${_ss_cucount[$_device_num]} ))
+if [ $_num_local_ranks -gt $_node_cus ] ; then
+  >&2 echo "ERROR: Not enough CUs ($_node_cus) for $_num_local_ranks ranks"
+  execOnError "$@"
+fi
+
+_utilized_CUs_per_device=$_available_CUs_per_device
+_rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+# Lower utilized CUs till divisible by number of rplaces per GPU
+while [ $_rem2 != 0 ] ; do
+  _utilized_CUs_per_device=$(( $_utilized_CUs_per_device - 1 ))
+  _rem2=$(( $_utilized_CUs_per_device % $_number_of_rplaces_per_GPU ))
+done
+_CUs_per_rplace=$(( $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU ))
+
+# --- THIS BLOCK ONLY FOR VERBOSE DIAGS PRINTED FROM RANK 0
+if [ $_local_rank_num == 0 ] && [[ "$GPURUN_VERBOSE" == "2" ]]; then
+  if [ $_uses_multi_device == 0 ] ; then
+    _wasted_CUs_on_each_GPU=$(( $_available_CUs_per_device - $_utilized_CUs_per_device ))
+    _total_GPU_rplaces=$(( $_number_of_rplaces_per_GPU * $_ri_num_devices ))
+    _total_wasted_rplaces=$(( $_total_GPU_rplaces - $_num_local_ranks ))
+    _wasted_GPUs=$(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))
+    _used_cus=$(( $_num_local_ranks * $_CUs_per_rplace ))
+    _utilization=$(( ( $_used_cus * 100 ) / $_node_cus ))
+    if ! [ $_ri_num_devices -gt $_num_local_ranks ] ; then
+      if [ $_wasted_CUs_on_each_GPU != 0 ] || [ $_total_wasted_rplaces != 0 ] ; then
+        _extra_diags=true
+      fi
+    fi
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED GPUS: $(( $_ri_num_devices - $_wasted_GPUs ))"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED GPUS: $(( $_total_wasted_rplaces / $_number_of_rplaces_per_GPU ))"
+    [ $_extra_diags ] && echo
+    >&2 echo "- RPLACEs PER NODE: $_total_GPU_rplaces"
+    >&2 echo "- RPLACEs PER GPU: $_number_of_rplaces_per_GPU"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED RPLACEs: $_num_local_ranks (RANKS)"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED RPLACEs: $_total_wasted_rplaces" ; \
+      >&2 echo "- gfxids ${_ss_gfxid[@]}"
+    >&2 echo "- CUs PER GPU: ${_ss_cucount[@]}"
+    [ $_extra_diags ] && \
+      >&2 echo "-- USED CUs on RANK0: $_utilized_CUs_per_device"
+    [ $_extra_diags ] && \
+      >&2 echo "-- UNUSED CUs RANK0: $_wasted_CUs_on_each_GPU"
+    >&2 echo "- CUs per RPLACE RANK0: $_CUs_per_rplace (OMPX_TARGET_TEAM_PROCS)"
+    >&2 echo "- FORMULA: OMPX_TARGET_TEAM_PROCS = $_utilized_CUs_per_device / $_number_of_rplaces_per_GPU"
+    if [[ ! -z "$ROCR_VISIBLE_DEVICES" ]] ; then
+      >&2 echo "- Preset ROCR_VISIBLE_DEVICES: $ROCR_VISIBLE_DEVICES"
+    fi
+    if [[ ! -z "$HSA_CU_MASK" ]] ; then
+      # node utilization could be incorrect with a preset cumask.
+      >&2 echo "- Preset HSA_CU_MASK: $HSA_CU_MASK"
+    else
+      >&2 echo "- NODE UTILIZATION: $_utilization %"
+    fi
+  else
+    >&2 echo "- ROCMINFO LOCATION: $ROCMINFO_BINARY"
+    >&2 echo "- PROCESSES: $_num_local_ranks (RANKS)"
+    >&2 echo "- AVAILABLE GPUS: $_ri_num_devices"
+    >&2 echo "- DEVS PER RANK: $_devices_per_mdset"
+    >&2 echo "- MULTI-DEVICE GPUS: $_md_total_devices (RANKS*DEVS-PER-RANK)"
+    _md_utilization=$(( $_md_total_devices * 100 / $_ri_num_devices ))
+    >&2 echo "- NODE UTILIZATION: $_md_utilization %"
+  fi
+fi
+# --- END OF DIAGNOSTIC BLOCK
+
+if [ $_CUs_per_rplace != $_available_CUs_per_device ] && [ $GPURUN_MASK_POLICY == "mutex" ] ; then
+  # Build the CU mask for this rank, bits_to_set = _CUs_per_rplace
+  _bits_to_set=$_CUs_per_rplace
+  # This formula keeps adjacent ranks on the same GPU, which should be preferred
+  _bits_to_shift=$(( ( $_local_rank_num * $_bits_to_set ) - ( _device_num * $_utilized_CUs_per_device ) ))
+  # use bc because these values can be very large
+  _unshifted_bits=`echo "(2 ^ $_bits_to_set) - 1" | bc`
+  _mask=`echo "obase=16; $_unshifted_bits * (2 ^ $_bits_to_shift)" | bc`
+  # Calculate the number of leading zeros needed for this mask
+  _lz=$(( ( $_utilized_CUs_per_device / 4 ) - ${#_mask} + 1 ))
+  for i in `seq 1 $_lz` ; do
+    _mask="0$_mask"
+  done
+  _mask="0x$_mask"
+fi
+
+_launch_process_cmd=""
+if [ $_uses_multi_device == 0 ] ; then
+  # retrieve scanned info from per device arrays
+  _bdfidstrc=${_ss_bdfid[$_device_num]}
+  NUMANODE=${_ss_numanode[$_device_num]}
+  _list_of_cpu_cores=${_ss_cpulist[$_device_num]}
+  _this_uuid=${_ss_uuid[$_device_num]}
+else
+  # Use multi-device values
+  _bdfidstrc=$_md_bdfs
+  NUMANODE=$_md_nns
+  _list_of_cpu_cores=$_md_cpus
+  _this_uuid=$_md_uuids
+  _launch_process_cmd+="env LIBOMPTARGET_NUM_MULTI_DEVICES=$_devices_per_mdset "
+fi
+if [ "$_use_numactl_localalloc" == "1" ] ; then
+  _launch_process_cmd+="$_launch_process_cmd_binary --localalloc --cpunodebind=$NUMANODE"
+elif [ "$_use_numactl_membind" == "1" ] ; then
+  _launch_process_cmd+="$_launch_process_cmd_binary --membind=$NUMANODE --cpunodebind=$NUMANODE"
+else
+  _launch_process_cmd+="$_launch_process_cmd_binary -c $_list_of_cpu_cores"
+fi
+
+# If gpurun was not given a command to execute, then don't run _launch_process_cmd
+[ "$*" == "" ] && _launch_process_cmd=""
+
+# only set ROCR_VISIBLE_DEVICES if not already set
+if [[ -z $ROCR_VISIBLE_DEVICES ]] ; then
+  export ROCR_VISIBLE_DEVICES=$_this_uuid
+  _log_word="RVD"
+else
+  _log_word="PRESET-RVD"
+fi
+
+export OMPX_TARGET_TEAM_PROCS=$_CUs_per_rplace
+
+# - Limit HSA queues when multiple ranks per GPU
+if [ $_number_of_rplaces_per_GPU != 1 ] ; then
+  # Only set these env controls if not set by caller
+  [[ -z "$GPU_MAX_HW_QUEUES" ]] && export GPU_MAX_HW_QUEUES=1
+  [[ -z "$LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES" ]] && export LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES=1
+fi
+
+[[ ! -z "$HSA_CU_MASK" ]] && [[ "$GPURUN_VERBOSE" != "0" ]] && \
+  [[ $_local_rank_num == 0 ]] && >&2 echo "WARNING: preset HSA_CU_MASK:$HSA_CU_MASK"
+
+if [ $_CUs_per_rplace == $_available_CUs_per_device ] || [ "$GPURUN_MASK_POLICY" == "nomask" ] ; then
+  # --- HSA_CU_MASK is NOT USED in this code block. This block covers all multi-device execution.
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    if [ $_uses_multi_device == 1 ] ; then
+      printf "RANK:$_local_rank_num D:$_md_dev_idxs NNs:$_md_nns GPUTYPE:$_gfxid $_log_word:$ROCR_VISIBLE_DEVICES\n CMD:$_launch_process_cmd $*\n" >&2
+    else
+      printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d GPUTYPE:$_gfxid $_log_word:%s \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE $ROCR_VISIBLE_DEVICES "$_launch_process_cmd" >&2
+    fi
+  fi
+  $_launch_process_cmd $*
+  # --- end code block
+else
+  # --- HSA_CU_MASK is required in this code block, assumes no multi-device
+  if [[ -z "$HSA_CU_MASK" ]] ; then
+    # Since ROCR_VISIBLE_DEVICES only enables 1 GPU, HSA_CU_MASK starts with 0:
+    export HSA_CU_MASK=0:$_mask
+  else
+    # use preset mask
+    _mask=$HSA_CU_MASK
+  fi
+  if [ "$GPURUN_VERBOSE" != "0" ] ; then
+    printf "RANK:$_local_rank_num D:%d PCI:%5s NN:%d $_gfxid CUMASK:$_mask $_log_word:$ROCR_VISIBLE_DEVICES \n CMD:%s $*\n" $_device_num $_bdfidstrc $NUMANODE "$_launch_process_cmd" >&2
+  fi
+  HSA_CU_MASK=0:$_mask \
+    $_launch_process_cmd $*
+  # --- end code block
+fi
+exit $?