diff --git a/compass/landice/tests/greenland/decomposition_test/__init__.py b/compass/landice/tests/greenland/decomposition_test/__init__.py index 06930965bd..80734423b0 100644 --- a/compass/landice/tests/greenland/decomposition_test/__init__.py +++ b/compass/landice/tests/greenland/decomposition_test/__init__.py @@ -1,13 +1,26 @@ -from compass.validate import compare_variables -from compass.testcase import TestCase from compass.landice.tests.greenland.run_model import RunModel +from compass.parallel import get_available_parallel_resources +from compass.testcase import TestCase +from compass.validate import compare_variables class DecompositionTest(TestCase): """ - A test case for performing two MALI runs of the Greenland Ice Sheet setup, - one with one core and one with eight. The test case verifies that the - results of the two runs are identical. + A test case for performing two MALI runs of the Greenland Ice Sheet setup + with different decompositions. The larger decomposition targets up to 32 + tasks, subject to available resources, and the smaller decomposition is + roughly half of the larger one. + + Attributes + ---------- + velo_solver : str + The velocity solver used for the test case + + proc_list : list of int + The pair of processor counts used in the decomposition comparison + + run_dirs : list of str + The names of the subdirectories for the two decomposition runs """ def __init__(self, test_group, velo_solver): @@ -24,25 +37,46 @@ def __init__(self, test_group, velo_solver): """ name = 'decomposition_test' self.velo_solver = velo_solver + self.proc_list = None + self.run_dirs = None subdir = '{}_{}'.format(velo_solver.lower(), name) super().__init__(test_group=test_group, name=name, subdir=subdir) - if velo_solver == 'sia': - self.cores_set = [1, 8] - elif velo_solver == 'FO': - self.cores_set = [16, 32] + def configure(self): + """ + Choose decomposition sizes from framework-detected resources and add + run steps. 
+ + The larger decomposition targets up to 32 tasks. FO runs require at + least 10 tasks; SIA runs require at least 2 tasks. + """ + available_resources = get_available_parallel_resources(self.config) + target_max_tasks = 32 + if self.velo_solver == 'FO': + smallest_acceptable_max_tasks = 10 + elif self.velo_solver == 'sia': + smallest_acceptable_max_tasks = 2 else: - raise ValueError('Unexpected velo_solver {}'.format(velo_solver)) + raise ValueError(f'Unexpected velo_solver {self.velo_solver}') + + max_tasks = max( + smallest_acceptable_max_tasks, + min(target_max_tasks, available_resources['cores'])) + low_tasks = max(1, max_tasks // 2) + self.proc_list = [low_tasks, max_tasks] - for procs in self.cores_set: + self.run_dirs = [] + for procs in self.proc_list: name = '{}proc_run'.format(procs) + if name in self.run_dirs: + name = '{}_{}'.format(name, len(self.run_dirs) + 1) + self.run_dirs.append(name) self.add_step( - RunModel(test_case=self, velo_solver=velo_solver, name=name, + RunModel(test_case=self, velo_solver=self.velo_solver, + name=name, subdir=name, ntasks=procs, min_tasks=procs, openmp_threads=1)) - # no configure() method is needed - # no run() method is needed def validate(self): @@ -50,8 +84,8 @@ def validate(self): Test cases can override this method to perform validation of variables and timers """ - name1 = '{}proc_run'.format(self.cores_set[0]) - name2 = '{}proc_run'.format(self.cores_set[1]) + name1 = self.run_dirs[0] + name2 = self.run_dirs[1] if self.velo_solver == 'sia': compare_variables(test_case=self, variables=['thickness', 'normalVelocity'], diff --git a/compass/landice/tests/humboldt/decomposition_test/__init__.py b/compass/landice/tests/humboldt/decomposition_test/__init__.py index 1def33a56d..58cf7703eb 100644 --- a/compass/landice/tests/humboldt/decomposition_test/__init__.py +++ b/compass/landice/tests/humboldt/decomposition_test/__init__.py @@ -1,15 +1,16 @@ from compass.landice.tests.humboldt.run_model import RunModel +from 
compass.parallel import get_available_parallel_resources from compass.testcase import TestCase from compass.validate import compare_variables class DecompositionTest(TestCase): """ - A test case for performing two MALI runs of a humboldt setup, one with one - core and one with four. The test case verifies that the results of the - two runs are identical or close to identical. The FO velocity solver is - not bit for bit across decompositions, so identical results are not - expected when it is used. + A test case for performing two MALI runs of a Humboldt setup with + different decompositions. The larger decomposition targets 32 tasks, + subject to available resources, and the smaller decomposition is roughly + half of the larger one. The test case verifies that results are identical + or close to identical. Attributes ---------- @@ -30,12 +31,15 @@ class DecompositionTest(TestCase): depth_integrated : bool Whether the (FO) velocity model is depth integrated + hydro : bool Whether to include subglacial hydrology - proc_list : list - The pair of processor count values to test over. - Function of velocity solver. + proc_list : list of int + The pair of processor counts used in the decomposition comparison + + run_dirs : list of str + The names of the subdirectories for the two decomposition runs """ def __init__(self, test_group, velo_solver, calving_law, mesh_type, @@ -78,6 +82,9 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, self.calving_law = calving_law self.damage = damage self.face_melt = face_melt + self.depth_integrated = depth_integrated + self.proc_list = None + self.run_dirs = None if hydro is not None: self.hydro = hydro else: @@ -99,23 +106,48 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, super().__init__(test_group=test_group, name=name, subdir=subdir) + def configure(self): + """ + Choose decomposition sizes from framework-detected resources and add + run steps. 
+ + The larger decomposition targets up to 32 tasks. FO runs require at + least 10 tasks; all others require at least 2 tasks. + """ + available_resources = get_available_parallel_resources(self.config) + # Target a max of 32 tasks, but use fewer if not available. + target_max_tasks = 32 + # FO solver requires more resources to be time-effective to run if self.velo_solver == 'FO': - self.proc_list = [16, 32] + smallest_acceptable_max_tasks = 10 else: - self.proc_list = [1, 32] + # Need at least 2 tasks to test decomposition. + smallest_acceptable_max_tasks = 2 + max_tasks = max( + smallest_acceptable_max_tasks, + min(target_max_tasks, available_resources['cores'])) + # Note: Failing when this many tasks are unavailable is + # desired behavior for decomposition testing. + + low_tasks = max(1, max_tasks // 2) + self.proc_list = [low_tasks, max_tasks] + + self.run_dirs = [] for procs in self.proc_list: name = '{}proc_run'.format(procs) + if name in self.run_dirs: + name = '{}_{}'.format(name, len(self.run_dirs) + 1) + self.run_dirs.append(name) self.add_step( RunModel(test_case=self, name=name, subdir=name, ntasks=procs, + min_tasks=procs, openmp_threads=1, velo_solver=self.velo_solver, calving_law=self.calving_law, damage=self.damage, face_melt=self.face_melt, - depth_integrated=depth_integrated, + depth_integrated=self.depth_integrated, hydro=self.hydro, - mesh_type=mesh_type)) - - # no configure() method is needed + mesh_type=self.mesh_type)) # no run() method is needed @@ -124,8 +156,8 @@ def validate(self): Test cases can override this method to perform validation of variables and timers """ - run_dir1 = '{}proc_run'.format(self.proc_list[0]) - run_dir2 = '{}proc_run'.format(self.proc_list[1]) + run_dir1 = self.run_dirs[0] + run_dir2 = self.run_dirs[1] var_list = ['thickness'] if self.velo_solver == 'sia': diff --git a/compass/landice/tests/humboldt/restart_test/__init__.py b/compass/landice/tests/humboldt/restart_test/__init__.py index dd68e73748..0761da3748 
100644 --- a/compass/landice/tests/humboldt/restart_test/__init__.py +++ b/compass/landice/tests/humboldt/restart_test/__init__.py @@ -1,12 +1,13 @@ from compass.landice.tests.humboldt.run_model import RunModel +from compass.parallel import get_available_parallel_resources from compass.testcase import TestCase from compass.validate import compare_variables class RestartTest(TestCase): """ - A test case for performing two MALI runs of a humboldt setup, one full - run and one run broken into two segments with a restart. The test case + A test case for performing two MALI runs of a Humboldt setup, one full + run and one run broken into two segments with a restart. The test case verifies that the results of the two runs are identical. Attributes @@ -14,6 +15,9 @@ class RestartTest(TestCase): mesh_type : str The resolution or type of mesh of the test case + velo_solver : str + The velocity solver used for the test case + calving_law : str The calving law used for the test case @@ -23,6 +27,9 @@ class RestartTest(TestCase): face_melt : bool Whether to include face melting + target_ntasks : int + The preferred task count for restart runs before resource constraints + depth_integrated : bool Whether the (FO) velocity model is depth integrated @@ -70,6 +77,7 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, self.calving_law = calving_law self.damage = damage self.face_melt = face_melt + self.target_ntasks = 32 if hydro is not None: self.hydro = hydro else: @@ -92,7 +100,8 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, subdir=subdir) name = 'full_run' - step = RunModel(test_case=self, name=name, subdir=name, ntasks=32, + step = RunModel(test_case=self, name=name, subdir=name, + ntasks=self.target_ntasks, openmp_threads=1, velo_solver=velo_solver, calving_law=self.calving_law, damage=self.damage, @@ -117,7 +126,8 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, self.add_step(step) name = 'restart_run' - step = 
RunModel(test_case=self, name=name, subdir=name, ntasks=32, + step = RunModel(test_case=self, name=name, subdir=name, + ntasks=self.target_ntasks, openmp_threads=1, velo_solver=velo_solver, calving_law=self.calving_law, damage=self.damage, @@ -152,7 +162,23 @@ def __init__(self, test_group, velo_solver, calving_law, mesh_type, 'streams.restart.rst', out_name='streams.landice.rst') self.add_step(step) - # no configure() method is needed + def configure(self): + """ + Set restart-test task counts from framework-detected resources. + + The target task count is 32 when available. FO runs require at least + 10 tasks; other runs allow any positive task count. + """ + available_resources = get_available_parallel_resources(self.config) + + min_tasks = 10 if self.velo_solver == 'FO' else 1 + ntasks = max(min_tasks, + min(self.target_ntasks, + available_resources['cores'])) + + # Apply the same task count to both full and restart runs. + for step in self.steps.values(): + step.set_resources(ntasks=ntasks, min_tasks=min_tasks) # no run() method is needed diff --git a/compass/landice/tests/thwaites/decomposition_test/__init__.py b/compass/landice/tests/thwaites/decomposition_test/__init__.py index 22d4419133..1ef934f1cf 100644 --- a/compass/landice/tests/thwaites/decomposition_test/__init__.py +++ b/compass/landice/tests/thwaites/decomposition_test/__init__.py @@ -1,13 +1,27 @@ from compass.landice.tests.thwaites.run_model import RunModel +from compass.parallel import get_available_parallel_resources from compass.testcase import TestCase from compass.validate import compare_variables class DecompositionTest(TestCase): """ - A test case for performing two MALI runs of the Thwaites setup, - with two different core counts. The test case verifies that the - results of the two runs are identical. + A test case for performing two MALI runs of the Thwaites setup with + different decompositions. 
The larger decomposition targets up to 32 + tasks, subject to available resources, and the smaller decomposition is + roughly half of the larger one. The test case verifies that the results + of the two runs are identical. + + Attributes + ---------- + depth_integrated : bool + Whether the FO velocity model is depth integrated + + proc_list : list of int + The pair of processor counts used in the decomposition comparison + + run_dirs : list of str + The names of the subdirectories for the two decomposition runs """ def __init__(self, test_group, depth_integrated=False): @@ -28,19 +42,39 @@ def __init__(self, test_group, depth_integrated=False): else: name = 'fo_decomposition_test' + self.depth_integrated = depth_integrated + self.proc_list = None + self.run_dirs = None super().__init__(test_group=test_group, name=name) - self.cores_set = [16, 32] + def configure(self): + """ + Choose decomposition sizes from framework-detected resources and add + run steps. - for procs in self.cores_set: + The larger decomposition targets up to 32 tasks and requires at least + 10 tasks to run this decomposition test. 
+ """ + available_resources = get_available_parallel_resources(self.config) + target_max_tasks = 32 + smallest_acceptable_max_tasks = 10 + max_tasks = max( + smallest_acceptable_max_tasks, + min(target_max_tasks, available_resources['cores'])) + low_tasks = max(1, max_tasks // 2) + self.proc_list = [low_tasks, max_tasks] + + self.run_dirs = [] + for procs in self.proc_list: name = '{}proc_run'.format(procs) + if name in self.run_dirs: + name = '{}_{}'.format(name, len(self.run_dirs) + 1) + self.run_dirs.append(name) self.add_step( RunModel(test_case=self, name=name, - depth_integrated=depth_integrated, + depth_integrated=self.depth_integrated, ntasks=procs, min_tasks=procs, openmp_threads=1)) - # no configure() method is needed - # no run() method is needed def validate(self): @@ -48,8 +82,8 @@ def validate(self): Test cases can override this method to perform validation of variables and timers """ - name1 = '{}proc_run'.format(self.cores_set[0]) - name2 = '{}proc_run'.format(self.cores_set[1]) + name1 = self.run_dirs[0] + name2 = self.run_dirs[1] # validate thickness compare_variables(test_case=self, variables=['thickness', ], diff --git a/docs/developers_guide/landice/api.rst b/docs/developers_guide/landice/api.rst index ae9736f688..4d7d329c8e 100644 --- a/docs/developers_guide/landice/api.rst +++ b/docs/developers_guide/landice/api.rst @@ -209,6 +209,7 @@ greenland Greenland decomposition_test.DecompositionTest + decomposition_test.DecompositionTest.configure decomposition_test.DecompositionTest.run restart_test.RestartTest @@ -247,9 +248,11 @@ humboldt run_model.RunModel.run decomposition_test.DecompositionTest + decomposition_test.DecompositionTest.configure decomposition_test.DecompositionTest.validate restart_test.RestartTest + restart_test.RestartTest.configure restart_test.RestartTest.validate hydro_radial @@ -460,6 +463,7 @@ thwaites Thwaites decomposition_test.DecompositionTest + decomposition_test.DecompositionTest.configure 
decomposition_test.DecompositionTest.run restart_test.RestartTest diff --git a/docs/developers_guide/landice/test_groups/greenland.rst b/docs/developers_guide/landice/test_groups/greenland.rst index cdc7e1a3d2..d94a48c9e5 100644 --- a/docs/developers_guide/landice/test_groups/greenland.rst +++ b/docs/developers_guide/landice/test_groups/greenland.rst @@ -60,10 +60,11 @@ decomposition_test ------------------ The :py:class:`compass.landice.tests.greenland.decomposition_test.DecompositionTest` -performs a 5-day run once on 1 core and once on 4 cores. It ensures that -``thickness`` and ``normalVelocity`` are identical at the end of the two runs -(as well as with a baseline if one is provided when calling -:ref:`dev_compass_setup`). +performs a 5-day run with two different decompositions. The larger run targets +32 tasks (or fewer if fewer are available), and the smaller run uses roughly +half as many tasks. It ensures that ``thickness`` and ``normalVelocity`` are +identical at the end of the two runs (as well as with a baseline if one is +provided when calling :ref:`dev_compass_setup`). .. _dev_landice_greenland_restart_test: diff --git a/docs/developers_guide/landice/test_groups/thwaites.rst b/docs/developers_guide/landice/test_groups/thwaites.rst index 7217d948d4..7454f8da51 100644 --- a/docs/developers_guide/landice/test_groups/thwaites.rst +++ b/docs/developers_guide/landice/test_groups/thwaites.rst @@ -42,10 +42,11 @@ decomposition_test ------------------ The :py:class:`compass.landice.tests.thwaites.decomposition_test.DecompositionTest` -performs a 5-day run once on 1 core and once on 4 cores. It ensures that -``thickness`` and ``surfaceSpeed`` are identical at the end of the two runs -(as well as with a baseline if one is provided when calling -:ref:`dev_compass_setup`). +performs a 5-day run with two different decompositions. 
The larger run targets +32 tasks (or fewer if fewer are available, with a minimum of 10), and the +smaller run uses roughly half as many tasks. It ensures that ``thickness`` and +``surfaceSpeed`` are identical at the end of the two runs (as well as with a +baseline if one is provided when calling :ref:`dev_compass_setup`). .. _dev_landice_thwaites_restart_test: diff --git a/docs/users_guide/landice/test_groups/greenland.rst b/docs/users_guide/landice/test_groups/greenland.rst index d50f2f2a46..c2c0baa9dc 100644 --- a/docs/users_guide/landice/test_groups/greenland.rst +++ b/docs/users_guide/landice/test_groups/greenland.rst @@ -116,9 +116,10 @@ decomposition_test ------------------ ``landice/greenland/decomposition_test`` runs short (5-day) integrations of the -model forward in time on 1 (``1proc_run`` step) and then on 4 cores -(``4proc_run`` step) to make sure the resulting prognostic variables are -bit-for-bit identical between the two runs. +model forward in time with two different decompositions. The larger run +targets 32 tasks (or fewer if fewer are available), and the smaller run uses +roughly half as many tasks. The resulting prognostic variables are compared +between the two runs to make sure they are bit-for-bit identical. restart_test ------------ diff --git a/docs/users_guide/landice/test_groups/humboldt.rst b/docs/users_guide/landice/test_groups/humboldt.rst index 724ff9cf04..0f43f48eef 100644 --- a/docs/users_guide/landice/test_groups/humboldt.rst +++ b/docs/users_guide/landice/test_groups/humboldt.rst @@ -75,13 +75,14 @@ decomposition_tests ------------------- There are a number of variants of a decomposition test that runs a 5-year -simulation on 16 (16proc_run step) and then on 32 cores (32proc_run step) -to make sure key prognostic variables are either bit-fot-bit (without the -FO solver) or have only small differences within a specified tolerance (with -the FO solver). 
The FO solver is not BFB on different decompositions, but the -differences are small. There are variants of this test for each calving law -that MALI currently supports, paired with either the FO velocity solver or no -velocity solver. +simulation with two different decompositions. The larger run targets 32 tasks +(or fewer if fewer are available), and the smaller run uses roughly half as +many tasks. Key prognostic variables are expected to be either bit-for-bit +(without the FO solver) or to have only small differences within specified +tolerances (with the FO solver). The FO solver is not BFB on different +decompositions, but the differences are small. There are variants of this test +for each calving law that MALI currently supports, paired with either the FO +velocity solver or no velocity solver. The full set of combinations use the 3 km mesh. There is additionally a decomposition test using the 1 km mesh that has calving disabled. Finally, there is a set of "full physics" tests that use von Mises calving, diff --git a/docs/users_guide/landice/test_groups/thwaites.rst b/docs/users_guide/landice/test_groups/thwaites.rst index 9a7c4bf0b1..9f42446ca7 100644 --- a/docs/users_guide/landice/test_groups/thwaites.rst +++ b/docs/users_guide/landice/test_groups/thwaites.rst @@ -70,9 +70,11 @@ decomposition_test ------------------ ``landice/thwaites/decomposition_test`` runs short (2-day) integrations of the -model forward in time on 16 (``16proc_run`` step) and then on 32 cores -(``32proc_run`` step) to make sure the resulting prognostic variables are -bit-for-bit identical between the two runs. +model forward in time with two different decompositions. The larger run +targets 32 tasks (or fewer if fewer are available, with a minimum of 10), and +the smaller run uses roughly half as many tasks. The resulting prognostic +variables are compared between the two runs to make sure they are +bit-for-bit identical. restart_test ------------