Skip to content

Commit eb6c67e

Browse files
vzhestkov authored and dwoz committed
Make minion reconnecting on changing master IP (bsc#1228182)
* Minions check dns when re-connecting to a master Check for a changing dns record anytime a minion gets disconnected from its master. See github issue #63654 #61482. * Regression tests for dns defined masters Adding tests to validate we check for changing dns anytime we're disconnected from the currently connected master * Update docs for master dns changes Update docs to use master_alive_interval to detect master ip changes via DNS. * Remove comment which is not true anymore * Make minion reconnecting on changing master IP with zeromq transport * Don't create schedule for alive if no master_alive_interval * Skip the tests if running with non-root user * Skip if unable to set additional IP address * Set master_tries to -1 for minions * Fix the tests --------- Co-authored-by: Daniel A. Wozniak <[email protected]>
1 parent b9865ba commit eb6c67e

File tree

13 files changed

+422
-120
lines changed

13 files changed

+422
-120
lines changed

conf/minion

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,8 @@
271271
#ping_interval: 0
272272

273273
# To auto recover minions if master changes IP address (DDNS)
274-
# auth_tries: 10
275-
# auth_safemode: True
276-
# ping_interval: 2
274+
# master_alive_interval: 10
275+
# master_tries: -1
277276
#
278277
# Minions won't know master is missing until a ping fails. After the ping fail,
279278
# the minion will attempt authentication and likely fails out and cause a restart.

doc/ref/configuration/minion.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,9 @@ Default: ``0``
291291

292292
Configures how often, in seconds, the minion will verify that the current
293293
master is alive and responding. The minion will try to establish a connection
294-
to the next master in the list if it finds the existing one is dead.
294+
to the next master in the list if it finds the existing one is dead. This
295+
setting can also be used to detect master DNS record changes when a minion has
296+
been disconnected.
295297

296298
.. code-block:: yaml
297299

salt/channel/client.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -385,8 +385,6 @@ def connect(self):
385385
# else take the relayed publish_port master reports
386386
else:
387387
publish_port = self.auth.creds["publish_port"]
388-
# TODO: The zeromq transport does not use connect_callback and
389-
# disconnect_callback.
390388
yield self.transport.connect(
391389
publish_port, self.connect_callback, self.disconnect_callback
392390
)

salt/config/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
else:
7676
_DFLT_IPC_MODE = "ipc"
7777
_DFLT_FQDNS_GRAINS = False
78-
_MASTER_TRIES = 1
78+
_MASTER_TRIES = -1
7979
_MASTER_USER = salt.utils.user.get_user()
8080

8181

@@ -1272,7 +1272,7 @@ def _gather_buffer_space():
12721272
"username": None,
12731273
"password": None,
12741274
"zmq_filtering": False,
1275-
"zmq_monitor": False,
1275+
"zmq_monitor": True,
12761276
"cache_sreqs": True,
12771277
"cmd_safe": True,
12781278
"sudo_user": "",

salt/minion.py

Lines changed: 83 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -2737,10 +2737,64 @@ def handle_event(self, package):
27372737
# we are not connected anymore
27382738
self.connected = False
27392739
log.info("Connection to master %s lost", self.opts["master"])
2740+
if self.opts["transport"] != "tcp":
2741+
self.schedule.delete_job(name=master_event(type="alive"))
2742+
2743+
log.info("Trying to tune in to next master from master-list")
2744+
2745+
if hasattr(self, "pub_channel"):
2746+
self.pub_channel.on_recv(None)
2747+
if hasattr(self.pub_channel, "auth"):
2748+
self.pub_channel.auth.invalidate()
2749+
if hasattr(self.pub_channel, "close"):
2750+
self.pub_channel.close()
2751+
if hasattr(self, "req_channel") and self.req_channel:
2752+
self.req_channel.close()
2753+
self.req_channel = None
2754+
2755+
# if eval_master finds a new master for us, self.connected
2756+
# will be True again on successful master authentication
2757+
try:
2758+
master, self.pub_channel = yield self.eval_master(
2759+
opts=self.opts,
2760+
failed=True,
2761+
failback=tag.startswith(master_event(type="failback")),
2762+
)
2763+
except SaltClientError:
2764+
pass
2765+
2766+
if self.connected:
2767+
self.opts["master"] = master
2768+
2769+
# re-init the subsystems to work with the new master
2770+
log.info(
2771+
"Re-initialising subsystems for new master %s",
2772+
self.opts["master"],
2773+
)
2774+
2775+
self.req_channel = salt.channel.client.AsyncReqChannel.factory(
2776+
self.opts, io_loop=self.io_loop
2777+
)
27402778

2741-
if self.opts["master_type"] != "failover":
2742-
# modify the scheduled job to fire on reconnect
2743-
if self.opts["transport"] != "tcp":
2779+
# put the current schedule into the new loaders
2780+
self.opts["schedule"] = self.schedule.option("schedule")
2781+
(
2782+
self.functions,
2783+
self.returners,
2784+
self.function_errors,
2785+
self.executors,
2786+
) = self._load_modules()
2787+
# make the schedule to use the new 'functions' loader
2788+
self.schedule.functions = self.functions
2789+
self.pub_channel.on_recv(self._handle_payload)
2790+
self._fire_master_minion_start()
2791+
log.info("Minion is ready to receive requests!")
2792+
2793+
# update scheduled job to run with the new master addr
2794+
if (
2795+
self.opts["transport"] != "tcp"
2796+
and self.opts["master_alive_interval"] > 0
2797+
):
27442798
schedule = {
27452799
"function": "status.master",
27462800
"seconds": self.opts["master_alive_interval"],
@@ -2749,116 +2803,35 @@ def handle_event(self, package):
27492803
"return_job": False,
27502804
"kwargs": {
27512805
"master": self.opts["master"],
2752-
"connected": False,
2806+
"connected": True,
27532807
},
27542808
}
27552809
self.schedule.modify_job(
27562810
name=master_event(type="alive", master=self.opts["master"]),
27572811
schedule=schedule,
27582812
)
2759-
else:
2760-
# delete the scheduled job to don't interfere with the failover process
2761-
if self.opts["transport"] != "tcp":
2762-
self.schedule.delete_job(name=master_event(type="alive"))
2763-
2764-
log.info("Trying to tune in to next master from master-list")
2765-
2766-
if hasattr(self, "pub_channel"):
2767-
self.pub_channel.on_recv(None)
2768-
if hasattr(self.pub_channel, "auth"):
2769-
self.pub_channel.auth.invalidate()
2770-
if hasattr(self.pub_channel, "close"):
2771-
self.pub_channel.close()
2772-
del self.pub_channel
2773-
2774-
# if eval_master finds a new master for us, self.connected
2775-
# will be True again on successful master authentication
2776-
try:
2777-
master, self.pub_channel = yield self.eval_master(
2778-
opts=self.opts,
2779-
failed=True,
2780-
failback=tag.startswith(master_event(type="failback")),
2781-
)
2782-
except SaltClientError:
2783-
pass
2784-
2785-
if self.connected:
2786-
self.opts["master"] = master
2787-
2788-
# re-init the subsystems to work with the new master
2789-
log.info(
2790-
"Re-initialising subsystems for new master %s",
2791-
self.opts["master"],
2792-
)
2793-
2794-
self.req_channel = (
2795-
salt.transport.client.AsyncReqChannel.factory(
2796-
self.opts, io_loop=self.io_loop
2797-
)
2798-
)
2799-
2800-
# put the current schedule into the new loaders
2801-
self.opts["schedule"] = self.schedule.option("schedule")
2802-
(
2803-
self.functions,
2804-
self.returners,
2805-
self.function_errors,
2806-
self.executors,
2807-
) = self._load_modules()
2808-
# make the schedule to use the new 'functions' loader
2809-
self.schedule.functions = self.functions
2810-
self.pub_channel.on_recv(self._handle_payload)
2811-
self._fire_master_minion_start()
2812-
log.info("Minion is ready to receive requests!")
2813-
2814-
# update scheduled job to run with the new master addr
2815-
if self.opts["transport"] != "tcp":
2816-
schedule = {
2817-
"function": "status.master",
2818-
"seconds": self.opts["master_alive_interval"],
2819-
"jid_include": True,
2820-
"maxrunning": 1,
2821-
"return_job": False,
2822-
"kwargs": {
2823-
"master": self.opts["master"],
2824-
"connected": True,
2825-
},
2826-
}
2827-
self.schedule.modify_job(
2828-
name=master_event(
2829-
type="alive", master=self.opts["master"]
2830-
),
2831-
schedule=schedule,
2832-
)
28332813

2834-
if (
2835-
self.opts["master_failback"]
2836-
and "master_list" in self.opts
2837-
):
2838-
if self.opts["master"] != self.opts["master_list"][0]:
2839-
schedule = {
2840-
"function": "status.ping_master",
2841-
"seconds": self.opts[
2842-
"master_failback_interval"
2843-
],
2844-
"jid_include": True,
2845-
"maxrunning": 1,
2846-
"return_job": False,
2847-
"kwargs": {
2848-
"master": self.opts["master_list"][0]
2849-
},
2850-
}
2851-
self.schedule.modify_job(
2852-
name=master_event(type="failback"),
2853-
schedule=schedule,
2854-
)
2855-
else:
2856-
self.schedule.delete_job(
2857-
name=master_event(type="failback"), persist=True
2858-
)
2859-
else:
2860-
self.restart = True
2861-
self.io_loop.stop()
2814+
if self.opts["master_failback"] and "master_list" in self.opts:
2815+
if self.opts["master"] != self.opts["master_list"][0]:
2816+
schedule = {
2817+
"function": "status.ping_master",
2818+
"seconds": self.opts["master_failback_interval"],
2819+
"jid_include": True,
2820+
"maxrunning": 1,
2821+
"return_job": False,
2822+
"kwargs": {"master": self.opts["master_list"][0]},
2823+
}
2824+
self.schedule.modify_job(
2825+
name=master_event(type="failback"),
2826+
schedule=schedule,
2827+
)
2828+
else:
2829+
self.schedule.delete_job(
2830+
name=master_event(type="failback"), persist=True
2831+
)
2832+
else:
2833+
self.restart = True
2834+
self.io_loop.stop()
28622835

28632836
elif tag.startswith(master_event(type="connected")):
28642837
# handle this event only once. otherwise it will pollute the log
@@ -2870,7 +2843,10 @@ def handle_event(self, package):
28702843
self.connected = True
28712844
# modify the __master_alive job to only fire,
28722845
# if the connection is lost again
2873-
if self.opts["transport"] != "tcp":
2846+
if (
2847+
self.opts["transport"] != "tcp"
2848+
and self.opts["master_alive_interval"] > 0
2849+
):
28742850
schedule = {
28752851
"function": "status.master",
28762852
"seconds": self.opts["master_alive_interval"],

salt/transport/zeromq.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Zeromq transport classes
33
"""
4+
45
import errno
56
import hashlib
67
import logging
@@ -211,6 +212,12 @@ def connect(self, publish_port, connect_callback=None, disconnect_callback=None)
211212
self.master_pub,
212213
)
213214
log.debug("%r connecting to %s", self, self.master_pub)
215+
if (
216+
hasattr(self, "_monitor")
217+
and self._monitor is not None
218+
and disconnect_callback is not None
219+
):
220+
self._monitor.disconnect_callback = disconnect_callback
214221
self._socket.connect(self.master_pub)
215222
connect_callback(True)
216223

@@ -680,13 +687,21 @@ def monitor_callback(self, msg):
680687
log.debug("ZeroMQ event: %s", evt)
681688
if evt["event"] == zmq.EVENT_MONITOR_STOPPED:
682689
self.stop()
690+
elif evt["event"] == zmq.EVENT_DISCONNECTED:
691+
if (
692+
hasattr(self, "disconnect_callback")
693+
and self.disconnect_callback is not None
694+
):
695+
self.disconnect_callback()
683696

684697
def stop(self):
685698
if self._socket is None:
686699
return
687700
self._socket.disable_monitor()
688701
self._socket = None
689-
self._monitor_socket = None
702+
if self._monitor_socket is not None:
703+
self._monitor_socket.close()
704+
self._monitor_socket = None
690705
if self._monitor_stream is not None:
691706
self._monitor_stream.close()
692707
self._monitor_stream = None

tests/pytests/scenarios/dns/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)