Skip to content

Commit fceb770

Browse files
committed
[SmartSwitch] Add graceful shutdown and startup handling in platform daemons
<!-- Provide a general summary of your changes in the Title above --> #### Description <!-- Describe your changes in detail --> HLD: https://github.com/sonic-net/SONiC/blob/master/doc/smart-switch/graceful-shutdown/graceful-shutdown.md These changes build upon enhancements in [`sonic-platform-daemons#667`](sonic-net#667). This PR introduces **graceful shutdown and startup orchestration** across SONiC platform daemons to ensure safe DPU and peripheral module transitions during reboot or administrative state changes. Key updates include: - Integration of `ModuleBase` lifecycle methods (`module_pre_shutdown`, `module_post_startup`, and `set_admin_state_gracefully`) into platform daemons. - Relocation of the graceful handling of PCIe detach/reattach and sensor reload sequences into `set_admin_state_gracefully`. - State tracking in `CHASSIS_MODULE_TABLE` via `STATE_DB` to synchronize transition state across processes. - File-based operation locks to prevent concurrent access to shared hardware resources. #### Motivation and Context <!-- Why is this change required? What problem does it solve? If this pull request closes/resolves an open Issue, make sure you include the text "fixes #xxxx", "closes #xxxx" or "resolves #xxxx" here --> Platform daemons currently perform shutdown and startup independently, leading to: - Race conditions during DPU detachment. - Inconsistent Redis state across PMON daemons. - Uncoordinated sensor and PCIe transitions during reboot. This change introduces a unified **graceful shutdown framework** for SmartSwitch modules. It ensures predictable module transitions, preserves hardware health, and supports orchestrated restarts without transient hardware errors. #### How Has This Been Tested? <!-- Please describe in detail how you tested your changes. Include details of your testing environment, and the tests you ran to see how your change affects other areas of the code, etc. --> Testing was performed on **DPU-enabled (SmartSwitch)** platforms. 
**Functional validation** - Verified end-to-end reboot flow with DPU detach/reattach sequence. - PCIe state (`detaching/attaching`) is reflected in `STATE_DB`. - `pcied` daemon logs confirm ordered detach before reboot and reattach after startup. - Confirmed no stale Redis entries or orphaned locks post-reboot. **Unit tests executed** - tests/test_DaemonPcied.py - tests/test_chassisd_graceful.py Coverage includes: - Transition flag handling - Timeout behavior - DB write/read operations - Graceful admin state flow **Manual validation** #### Additional Information (Optional)
1 parent 69ce387 commit fceb770

File tree

3 files changed

+234
-196
lines changed

3 files changed

+234
-196
lines changed

sonic-chassisd/scripts/chassisd

Lines changed: 18 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -248,22 +248,13 @@ class SmartSwitchModuleConfigUpdater(logger.Logger):
248248

249249
if (admin_state == MODULE_ADMIN_DOWN) or (admin_state == MODULE_ADMIN_UP):
250250
self.log_info("Changing module {} to admin {} state".format(key, 'DOWN' if admin_state == MODULE_ADMIN_DOWN else 'UP'))
251-
t = threading.Thread(target=self.submit_callback, args=(module_index, admin_state, key))
251+
t = threading.Thread(target=self.submit_callback, args=(module_index, admin_state))
252252
t.start()
253253
else:
254254
self.log_warning("Invalid admin_state value: {}".format(admin_state))
255255

256-
def submit_callback(self, module_index, admin_state, key):
257-
if admin_state == MODULE_ADMIN_DOWN:
258-
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
259-
# there are no actions taken during this function execution.
260-
try_get(self.chassis.get_module(module_index).module_pre_shutdown, default=False)
261-
try_get(self.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
262-
if admin_state == MODULE_ADMIN_UP:
263-
# This is only valid on platforms which have pci_rescan sensord changes required. If it is not implemented,
264-
# there are no actions taken during this function execution.
265-
try_get(self.chassis.get_module(module_index).module_post_startup, default=False)
266-
pass
256+
def submit_callback(self, module_index, admin_state):
257+
try_get(self.chassis.get_module(module_index).set_admin_state_gracefully, admin_state, default=False)
267258

268259
#
269260
# Module Updater ==============================================================
@@ -723,7 +714,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
723714
self.module_reboot_table = swsscommon.Table(self.chassis_state_db, CHASSIS_MODULE_REBOOT_INFO_TABLE)
724715
self.down_modules = {}
725716
self.chassis_app_db_clean_sha = None
726-
self.module_transition_flag_helper = ModuleTransitionFlagHelper()
727717

728718
self.midplane_initialized = try_get(chassis.init_midplane_switch, default=False)
729719
if not self.midplane_initialized:
@@ -815,9 +805,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
815805
# Persist dpu down time
816806
self.persist_dpu_reboot_time(key)
817807
# persist reboot cause
818-
# Clear transition flag in STATE_DB
819-
self.module_transition_flag_helper.clear_transition_flag(key)
820-
821808
reboot_cause = try_get(self.chassis.get_module(module_index).get_reboot_cause)
822809
self.persist_dpu_reboot_cause(reboot_cause, key)
823810
# publish reboot cause to db
@@ -852,9 +839,6 @@ class SmartSwitchModuleUpdater(ModuleUpdater):
852839
self.persist_dpu_reboot_cause(reboot_cause, key)
853840
self.update_dpu_reboot_cause_to_db(key)
854841

855-
# Clear transition flag in STATE_DB
856-
self.module_transition_flag_helper.clear_transition_flag(key)
857-
858842
def _get_module_info(self, module_index):
859843
"""
860844
Retrieves module info of this module
@@ -1336,34 +1320,6 @@ class DpuStateUpdater(logger.Logger):
13361320
self._update_dp_dpu_state('down')
13371321
self._update_cp_dpu_state('down')
13381322

1339-
class ModuleTransitionFlagHelper(logger.Logger):
1340-
def __init__(self, log_identifier = SYSLOG_IDENTIFIER):
1341-
super(ModuleTransitionFlagHelper, self).__init__(log_identifier)
1342-
# Use new connector to avoid redis failures
1343-
"""Create a helper function to get the module table,
1344-
since multiple threads updating with the same connector will cause redis failures"""
1345-
state_db = daemon_base.db_connect("STATE_DB")
1346-
self.module_table = swsscommon.Table(state_db, CHASSIS_MODULE_INFO_TABLE)
1347-
1348-
def set_transition_flag(self, module_name):
1349-
try:
1350-
self.module_table.hset(module_name, 'state_transition_in_progress', 'True')
1351-
self.module_table.hset(module_name, 'transition_start_time', datetime.now(timezone.utc).replace(tzinfo=None).isoformat())
1352-
except Exception as e:
1353-
self.log_error(f"Error setting transition flag for {module_name}: {e}")
1354-
1355-
def clear_transition_flag(self, module_name):
1356-
try:
1357-
self.log_info(f"Clearing transition flag for {module_name}")
1358-
self.module_table.hdel(module_name, 'state_transition_in_progress')
1359-
self.module_table.hdel(module_name, 'transition_start_time')
1360-
except Exception as e:
1361-
self.log_error(f"Error clearing transition flag for {module_name}: {e}")
1362-
1363-
def clear_all_transition_flags(self):
1364-
for module_name in self.module_table.getKeys():
1365-
self.clear_transition_flag(module_name)
1366-
13671323
#
13681324
# Daemon =======================================================================
13691325
#
@@ -1400,22 +1356,30 @@ class ChassisdDaemon(daemon_base.DaemonBase):
14001356
else:
14011357
self.log_warning("Caught unhandled signal '{}' - ignoring...".format(SIGNALS_TO_NAMES_DICT[sig]))
14021358

1403-
def submit_dpu_callback(self, module_index, admin_state, module_name):
1359+
def submit_dpu_callback(self, module_index, admin_state):
1360+
module = self.module_updater.chassis.get_module(module_index)
1361+
14041362
# This is only valid on platforms which have pci_detach and sensord changes required. If it is not implemented,
14051363
# there are no actions taken during this function execution.
1406-
try_get(self.module_updater.chassis.get_module(module_index).module_pre_shutdown, default=False)
1364+
if admin_state == MODULE_PRE_SHUTDOWN:
1365+
try_get(module.module_pre_shutdown, default=False)
14071366
# Set admin_state change in progress using the centralized method
14081367
if admin_state == MODULE_ADMIN_DOWN:
1409-
ModuleTransitionFlagHelper().set_transition_flag(module_name)
1410-
try_get(self.module_updater.chassis.get_module(module_index).set_admin_state, admin_state, default=False)
1368+
try_get(module.set_admin_state_gracefully, admin_state, default=False)
14111369

14121370
def set_initial_dpu_admin_state(self):
14131371
"""Send admin_state trigger once to modules those are powered up"""
14141372
threads = []
14151373
for module_index in range(0, self.module_updater.num_modules):
1416-
op = None
1417-
# Get operational state of DPU
14181374
module_name = self.platform_chassis.get_module(module_index).get_name()
1375+
1376+
# Clear any existing state transition flags
1377+
module = self.module_updater.chassis.get_module(module_index)
1378+
module.clear_module_state_transition(module_name)
1379+
module.clear_module_gnoi_halt_in_progress()
1380+
1381+
# Get operational state of DPU
1382+
op = None
14191383
operational_state = self.platform_chassis.get_module(module_index).get_oper_status()
14201384

14211385
try:
@@ -1437,7 +1401,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):
14371401

14381402
if op is not None:
14391403
# Create and start a thread for the DPU logic
1440-
thread = threading.Thread(target=self.submit_dpu_callback, args=(module_index, op, module_name))
1404+
thread = threading.Thread(target=self.submit_dpu_callback, args=(module_index, op))
14411405
thread.daemon = True # Set as a daemon thread
14421406
thread.start()
14431407
threads.append(thread)
@@ -1486,16 +1450,7 @@ class ChassisdDaemon(daemon_base.DaemonBase):
14861450

14871451
# Set the initial DPU admin state for SmartSwitch
14881452
if self.smartswitch:
1489-
# Clear all stale transition flags for SmartSwitch on startup
1490-
ModuleTransitionFlagHelper().clear_all_transition_flags()
1491-
self.set_initial_dpu_admin_state()
1492-
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1493-
module_transition_flag_helper = ModuleTransitionFlagHelper()
1494-
# Clear all stale transition flags for SmartSwitch on startup
1495-
module_transition_flag_helper.clear_all_transition_flags()
14961453
self.set_initial_dpu_admin_state()
1497-
# Clear all transition flags for SmartSwitch after setting the initial DPU admin state
1498-
module_transition_flag_helper.clear_all_transition_flags()
14991454

15001455
while not self.stop.wait(CHASSIS_INFO_UPDATE_PERIOD_SECS):
15011456
self.module_updater.module_db_update()

sonic-chassisd/tests/mock_platform.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,18 @@ def module_pre_shutdown(self):
7878
def module_post_startup(self):
7979
pass
8080

81+
def set_admin_state_gracefully(self, up):
82+
"""Mock implementation of set_admin_state_gracefully"""
83+
return self.set_admin_state(up)
84+
85+
def clear_module_state_transition(self, module_name):
86+
"""Mock implementation of clear_module_state_transition"""
87+
return True
88+
89+
def clear_module_gnoi_halt_in_progress(self):
90+
"""Mock implementation of clear_module_gnoi_halt_in_progress"""
91+
return True
92+
8193
def is_midplane_reachable(self):
8294
return self.midplane_access
8395

0 commit comments

Comments
 (0)