diff --git a/.github/workflows/ci-verification.yml b/.github/workflows/ci-verification.yml index 9c0d23fedde1..2908f0b4a50c 100644 --- a/.github/workflows/ci-verification.yml +++ b/.github/workflows/ci-verification.yml @@ -39,7 +39,7 @@ jobs: - name: Install TLC dependencies run: | tdnf install -y jre wget - python3 tla/install_deps.py --skip-apt-packages + python3 tla/install_deps.py - run: cd tla && ./tlc.py mc consistency/MCSingleNode.tla - run: cd tla && ./tlc.py mc consistency/MCSingleNodeReads.tla @@ -68,7 +68,7 @@ jobs: - name: Install TLC dependencies run: | sudo apt update - sudo apt install -y default-jre + sudo apt install -y default-jre wget python3 install_deps.py - run: ./tlc_debug.sh --config consistency/MCSingleNodeCommitReachability.cfg mc consistency/MCSingleNodeReads.tla @@ -88,7 +88,7 @@ jobs: - name: Install TLC dependencies run: | sudo apt update - sudo apt install -y default-jre + sudo apt install -y default-jre wget python3 install_deps.py - run: ./tlc.py sim --num 500 --depth 50 consistency/MultiNodeReads.tla @@ -121,7 +121,7 @@ jobs: - name: Install TLC dependencies run: | tdnf install -y jre wget - python3 tla/install_deps.py --skip-apt-packages + python3 tla/install_deps.py - run: cd tla && ./tlc.py mc consensus/MCabs.tla - run: cd tla && ./tlc.py --trace-name 1C2N mc --term-count 2 --request-count 2 --raft-configs 1C2N consensus/MCccfraft.tla @@ -148,7 +148,7 @@ jobs: - name: Install TLC dependencies run: | sudo apt update - sudo apt install -y default-jre + sudo apt install -y default-jre wget python3 install_deps.py - run: ./tlc.py sim consensus/SIMccfraft.tla @@ -181,22 +181,16 @@ jobs: with: fetch-depth: 0 - - name: Install TLC dependencies - run: | - tdnf install -y jre wget - python3 tla/install_deps.py --skip-apt-packages - - name: "Install dependencies" shell: bash run: | set -ex ./scripts/setup-ci.sh - # Parallel - wget https://ftp.gnu.org/gnu/parallel/parallel-latest.tar.bz2 - tar -xjf parallel-latest.tar.bz2 - cd $(ls | grep 'parallel' | grep -v 'tar' | grep -v 'rpm') - ./configure && make && make install + - name: Install TLC dependencies + run: | + tdnf install -y jre wget + python3 tla/install_deps.py --tdnf-extended - name: "Build" run: | diff --git a/.github/workflows/long-verification.yml b/.github/workflows/long-verification.yml index 2cdb138cd282..22dd3d4e6320 100644 --- a/.github/workflows/long-verification.yml +++ b/.github/workflows/long-verification.yml @@ -37,7 +37,7 @@ jobs: - name: Install TLC dependencies run: | tdnf install -y jre wget - python3 tla/install_deps.py --skip-apt-packages + python3 tla/install_deps.py - run: cd tla && ./tlc.py --trace-name 2C2N mc --term-count 2 --request-count 0 --raft-configs 2C2N --disable-check-quorum consensus/MCccfraft.tla @@ -70,7 +70,7 @@ jobs: - name: Install TLC dependencies run: | tdnf install -y jre wget - python3 tla/install_deps.py --skip-apt-packages + python3 tla/install_deps.py - run: cd tla && ./tlc.py --trace-name 3C2N mc --term-count 2 --request-count 0 --raft-configs 3C2N --disable-check-quorum consensus/MCccfraft.tla @@ -95,7 +95,7 @@ jobs: - uses: actions/checkout@v4 - run: | sudo apt update - sudo apt install -y default-jre + sudo apt install -y default-jre wget python3 install_deps.py - run: ./tlc.py sim --max-seconds 3000 --depth 500 consensus/SIMccfraft.tla diff --git a/CHANGELOG.md b/CHANGELOG.md index 7347e857c30e..a6eafdd4f53b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). +## [6.0.17] + +[6.0.17]: https://github.com/microsoft/CCF/releases/tag/6.0.17 + +### Added + +- Support for PreVote optimisation. Nodes understand and are able to respond to PreVote messages, but will not become pre-vote candidates themselves. (#7419, #7445) + +### Fixed + +- CheckQuorum now requires a quorum in every configuration (#7375) + ## [6.0.16] [6.0.16]: https://github.com/microsoft/CCF/releases/tag/6.0.16 diff --git a/doc/architecture/consensus/index.rst b/doc/architecture/consensus/index.rst index deda4375cba4..a75f9b660ff2 100644 --- a/doc/architecture/consensus/index.rst +++ b/doc/architecture/consensus/index.rst @@ -25,6 +25,7 @@ Supported extensions include: - "CheckQuorum": the primary node automatically steps down, in the same view, if it does not hear back (via ``AppendEntriesResponse`` messages) from a majority of backups within a ``consensus.election_timeout`` period. This prevents an isolated primary node from still processing client write requests without being able to commit them. - "NoTimeoutRetirement": a primary node that completes its retirement sends a ProposeRequestVote message to the most up-to-date node in the new configuration, causing that node to run for election without waiting for time out. +- "PreVote": followers must first request a pre-vote before starting a new election. This prevents followers from starting elections (and increasing the term) when they are isolated from the rest of the network. Replica State Machine --------------------- @@ -206,3 +207,75 @@ Until the very last phase (``RetiredCommitted``) is reached, a retiring leader w Note that because the rollback triggered when a node becomes aware of a new term never preserves unsigned transactions, and because RCI is always the first signature after RI, RI and RCI are always both rolled back if RCI itself is rolled back. + +PreVote Extensions +~~~~~~~~~~~~~~~~~~ + +If a node's ``RequestVote`` requests are able to reach the cluster, but it is unable to hear the ``AppendEntries`` messages from the current leader (for example, due to network partitioning), it may start new elections, incrementing its term, which deposes the leader and disrupts the cluster. + +To mitigate this, the PreVote extension requires that a follower first become ``PreVoteCandidate`` and receive a quorum of speculative pre-votes, proving that they could be elected using the standard Raft election conditions, before becoming ``Candidate`` and potentially disrupting the cluster. + +More specifically, when a follower's election timeout elapses, it becomes a ``PreVoteCandidate`` for the current term and sends out ``RequestVote`` messages with the ``electionType`` set to ``ElectionType::PreVote``. +If the ``PreVoteCandidate`` hears from a current leader, or a new leader, it reverts back to being a ``Follower``. +Nodes receive this pre-vote request, and respond positively if node would have voted for the ``PreVoteCandidate``'s ledger during an election, (ie. if the ``PreVoteCandidate``'s ledger is at least as up to date as the receiver's ledger). +If the ``PreVoteCandidate`` receives a quorum of positive pre-vote responses, it then becomes a ``Candidate``, increments its term, sends a ``RequestVote`` message with ``election_type`` set to ``ElectionType::RegularVote`` and the election proceeds as normal from here. + +.. mermaid:: + + sequenceDiagram + participant Node 0 + participant Node 1 + participant Node 2 + + Note over Node 0: Leader for term 2 + + Note over Node 1: PreVoteCandidate in term 2 + Node 1 ->> Node 2: RequestVote(ElectionType::PreVote, term=2) + + Note right of Node 2: No changes to Node 2's state + Node 2 ->> Node 1: RequestVoteResponse(ElectionType::PreVote, term=2, granted=true) + + Note over Node 1: Candidate in term 3 + Node 1 ->> Node 2: RequestVote(ElectionType::RegularVote, term=3) + + Note right of Node 2: Updates term to 3 and votes for Node 1 + Node 2 ->> Node 1: RequestVoteResponse(ElectionType::RegularVote, term=3, granted=true) + + Note over Node 1: Leader for term 3 + +The only state update in response to a pre-vote message is that if the node's term is older than the pre-vote messages's it will update it. +This allows the pre-vote request to inform lagging nodes that a more recent term had a node succeed in its pre-vote, becoming a Candidate or a Leader. +This can be viewed as piggybacking the term information from that previous Candidate or Leader, with the pre-vote request to the lagging node. + +.. mermaid:: + + sequenceDiagram + participant Node 0 + participant Node 1 + participant Node 2 + + Note over Node 0: Leader for term 2 + Note over Node 1: Follower in term 2 + Note over Node 2: Lagging Follower in term 1 + + Note over Node 1: PreVoteCandidate in term 2 + Node 1 ->> Node 2: RequestVote(ElectionType::PreVote, term=2) + + Note right of Node 2: Updates term to 2 + Node 2 ->> Node 1: RequestVoteResponse(ElectionType::PreVote, term=2, granted=true) + + Note over Node 1: Candidate in term 3 + Node 1 ->> Node 2: RequestVote(ElectionType::RegularVote, term=3) + + Note right of Node 2: Updates to term 3 and votes for Node 1 + Node 2 ->> Node 1: RequestVoteResponse(ElectionType::RegularVote, term=3, granted=true) + + Note over Node 1: Leader for term 3 + +Migration to PreVote +~~~~~~~~~~~~~~~~~~~~ + +Supposing we have a cluster of nodes which currently do not support PreVote, we must first migrate the cluster to support PreVote before we can enable it, as the nodes that do not support PreVote will respond incorrectly to PreVote requests. + +To enable PreVote safely, we must first migrate the cluster to support PreVote messages, and then enable PreVote. +During the migration to enable PreVote, the pre-vote candidates will be less likely to be elected leader, as the other followers may preempt the pre-vote candidate and become candidates themselves. diff --git a/doc/architecture/raft_tla.rst b/doc/architecture/raft_tla.rst index 19f6ac410a1a..fdde71565601 100644 --- a/doc/architecture/raft_tla.rst +++ b/doc/architecture/raft_tla.rst @@ -44,4 +44,7 @@ It is possible to produce fresh traces quickly from the driver by running the `` Calling the trace validation on, for example, the ``append`` scenario can then be done with ``./tlc.py --driver-trace ../build/append.ndjson consensus/Traceccfraft.tla``. +Generating a trace of a scenario and validating it in one go can be done with ``./tlc.py --workers 1 tv --scenario ../tests/raft_scenarios/append consensus/Traceccfraft.tla``. +This runs the raft_driver on the scenario, cleans the trace and then validates it against the TLA+ specification. + CCF also provides a command line trace visualizer to aid debugging, for example, the ``append`` scenario can be visualized with ``python ../tests/trace_viz.py ../build/append.ndjson``. diff --git a/doc/schemas/node_openapi.json b/doc/schemas/node_openapi.json index 3b682f62ddce..97246a8ca76a 100644 --- a/doc/schemas/node_openapi.json +++ b/doc/schemas/node_openapi.json @@ -464,6 +464,7 @@ "None", "Leader", "Follower", + "PreVoteCandidate", "Candidate" ], "type": "string" @@ -904,7 +905,7 @@ "info": { "description": "This API provides public, uncredentialed access to service and node state.", "title": "CCF Public Node API", - "version": "4.13.0" + "version": "4.14.0" }, "openapi": "3.0.0", "paths": { diff --git a/python/pyproject.toml b/python/pyproject.toml index 8c51ae195967..4cb87953b8f9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ccf" -version = "6.0.16" +version = "6.0.17" authors = [ { name="CCF Team", email="CCF-Sec@microsoft.com" }, ] diff --git a/src/consensus/aft/impl/state.h b/src/consensus/aft/impl/state.h index b41bc4cfe9aa..dbb3a5166938 100644 --- a/src/consensus/aft/impl/state.h +++ b/src/consensus/aft/impl/state.h @@ -150,7 +150,10 @@ namespace aft struct State { - State(const ccf::NodeId& node_id_) : node_id(node_id_) {} + State(const ccf::NodeId& node_id_, bool pre_vote_enabled_ = false) : + node_id(node_id_), + pre_vote_enabled(pre_vote_enabled_) + {} State() = default; ccf::pal::Mutex lock; @@ -188,6 +191,8 @@ namespace aft // Index at which this node observes its retired_committed, only set when // that index itself is committed std::optional retired_committed_idx = std::nullopt; + + bool pre_vote_enabled = false; }; DECLARE_JSON_TYPE_WITH_OPTIONAL_FIELDS(State); DECLARE_JSON_REQUIRED_FIELDS( @@ -197,7 +202,8 @@ namespace aft last_idx, commit_idx, leadership_state, - membership_state); + membership_state, + pre_vote_enabled); DECLARE_JSON_OPTIONAL_FIELDS( State, retirement_phase, diff --git a/src/consensus/aft/raft.h b/src/consensus/aft/raft.h index 73f3e7a96aab..8a6d11af4f8e 100644 --- a/src/consensus/aft/raft.h +++ b/src/consensus/aft/raft.h @@ -771,6 +771,15 @@ namespace aft break; } + case raft_request_pre_vote: + { + RequestPreVote r = + channels->template recv_authenticated( + from, data, size); + recv_request_pre_vote(from, r); + break; + } + case raft_request_vote: { RequestVote r = channels->template recv_authenticated( @@ -779,6 +788,15 @@ namespace aft break; } + case raft_request_pre_vote_response: + { + RequestPreVoteResponse r = + channels->template recv_authenticated( + from, data, size); + recv_request_pre_vote_response(from, r); + break; + } + case raft_request_vote_response: { RequestVoteResponse r = @@ -846,38 +864,36 @@ namespace aft node.second.last_ack_timeout += elapsed; } - bool has_quorum_of_backups = false; - for (auto const& conf : configurations) - { - size_t backup_ack_timeout_count = 0; - for (auto const& node : conf.nodes) - { - auto search = all_other_nodes.find(node.first); - if (search == all_other_nodes.end()) - { - // Ignore ourselves as primary - continue; - } - if (search->second.last_ack_timeout >= election_timeout) + bool every_active_config_has_a_quorum = std::all_of( + configurations.begin(), + configurations.end(), + [this](const Configuration& conf) { + size_t live_nodes_in_config = 0; + for (auto const& node : conf.nodes) { - RAFT_DEBUG_FMT( - "No ack received from {} in last {}", - node.first, - election_timeout); - backup_ack_timeout_count++; + auto search = all_other_nodes.find(node.first); + if ( + // if a (non-self) node is in a configuration, then it is in + // all_other_nodes. So if a node in a configuration is not found + // in all_other_nodes, it must be self, and hence is live + search == all_other_nodes.end() || + // Otherwise we use the most recent ack as a failure probe + search->second.last_ack_timeout < election_timeout) + { + ++live_nodes_in_config; + } + else + { + RAFT_DEBUG_FMT( + "No ack received from {} in last {}", + node.first, + election_timeout); + } } - } - - if (backup_ack_timeout_count < get_quorum(conf.nodes.size() - 1)) - { - // If primary has quorum of active backups in _any_ configuration, - // it should remain primary - has_quorum_of_backups = true; - break; - } - } + return live_nodes_in_config >= get_quorum(conf.nodes.size()); + }); - if (!has_quorum_of_backups) + if (!every_active_config_has_a_quorum) { // CheckQuorum: The primary automatically steps down if there are no // active configuration in which it has heard back from a majority of @@ -898,7 +914,14 @@ namespace aft timeout_elapsed >= election_timeout) { // Start an election. - become_candidate(); + if (state->pre_vote_enabled) + { + become_pre_vote_candidate(); + } + else + { + become_candidate(); + } } } } @@ -1034,16 +1057,6 @@ namespace aft const auto prev_term = get_term_internal(prev_idx); const auto term_of_idx = get_term_internal(end_idx); - RAFT_DEBUG_FMT( - "Send append entries from {} to {}: ({}.{}, {}.{}] ({})", - state->node_id, - to, - prev_term, - prev_idx, - term_of_idx, - end_idx, - state->commit_idx); - #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wc99-designator" AppendEntries ae{ @@ -1056,6 +1069,17 @@ namespace aft }; #pragma clang diagnostic pop + RAFT_DEBUG_FMT( + "Send {} from {} to {}: ({}.{}, {}.{}] ({})", + ae.msg, + state->node_id, + to, + prev_term, + prev_idx, + term_of_idx, + end_idx, + state->commit_idx); + auto& node = all_other_nodes.at(to); #ifdef CCF_RAFT_TRACING @@ -1091,12 +1115,14 @@ namespace aft std::unique_lock guard(state->lock); RAFT_DEBUG_FMT( - "Received append entries: {}.{} to {}.{} (from {} in term {})", + "Recv {} to {} from {}: {}.{} to {}.{} in term {}", + r.msg, + state->node_id, + from, r.prev_term, r.prev_idx, r.term_of_idx, r.idx, - from, r.term); #ifdef CCF_RAFT_TRACING @@ -1117,7 +1143,8 @@ namespace aft // follower if necessary if ( state->current_view == r.term && - state->leadership_state == ccf::kv::LeadershipState::Candidate) + (state->leadership_state == ccf::kv::LeadershipState::Candidate || + state->leadership_state == ccf::kv::LeadershipState::PreVoteCandidate)) { become_aware_of_new_term(r.term); } @@ -1129,7 +1156,8 @@ namespace aft { // Reply false, since our term is later than the received term. RAFT_INFO_FMT( - "Recv append entries to {} from {} but our term is later ({} > {})", + "Recv {} to {} from {} but our term is later ({} > {})", + r.msg, state->node_id, from, state->current_view, @@ -1150,8 +1178,9 @@ namespace aft if (prev_term == 0) { RAFT_DEBUG_FMT( - "Recv append entries to {} from {} but our log does not yet " + "Recv {} to {} from {} but our log does not yet " "contain index {}", + r.msg, state->node_id, from, r.prev_idx); @@ -1160,8 +1189,9 @@ namespace aft else { RAFT_DEBUG_FMT( - "Recv append entries to {} from {} but our log at {} has the wrong " + "Recv {} to {} from {} but our log at {} has the wrong " "previous term (ours: {}, theirs: {})", + r.msg, state->node_id, from, r.prev_idx, @@ -1187,8 +1217,9 @@ namespace aft if (r.prev_idx < state->commit_idx) { RAFT_DEBUG_FMT( - "Recv append entries to {} from {} but prev_idx ({}) < commit_idx " + "Recv {} to {} from {} but prev_idx ({}) < commit_idx " "({})", + r.msg, state->node_id, from, r.prev_idx, @@ -1202,7 +1233,8 @@ namespace aft else if (r.prev_idx > state->last_idx) { RAFT_FAIL_FMT( - "Recv append entries to {} from {} but prev_idx ({}) > last_idx ({})", + "Recv {} to {} from {} but prev_idx ({}) > last_idx ({})", + r.msg, state->node_id, from, r.prev_idx, @@ -1211,7 +1243,8 @@ namespace aft } RAFT_DEBUG_FMT( - "Recv append entries to {} from {} for index {} and previous index {}", + "Recv {} to {} from {} for index {} and previous index {}", + r.msg, state->node_id, from, r.idx, @@ -1293,7 +1326,8 @@ namespace aft { // This should only fail if there is malformed data. RAFT_FAIL_FMT( - "Recv append entries to {} from {} but the data is malformed: {}", + "Recv {} to {} from {} but the data is malformed: {}", + r.msg, state->node_id, from, e.what()); @@ -1306,8 +1340,9 @@ namespace aft if (ds == nullptr) { RAFT_FAIL_FMT( - "Recv append entries to {} from {} but the entry could not be " + "Recv {} to {} from {} but the entry could not be " "deserialised", + r.msg, state->node_id, from); send_append_entries_response_nack(from); @@ -1506,19 +1541,20 @@ namespace aft aft::Term response_term, aft::Index response_idx) { - RAFT_DEBUG_FMT( - "Send append entries response from {} to {} for index {}: {}", - state->node_id, - to, - response_idx, - (answer == AppendEntriesResponseType::OK ? "ACK" : "NACK")); - AppendEntriesResponse response{ .term = response_term, .last_log_idx = response_idx, .success = answer, }; + RAFT_DEBUG_FMT( + "Send {} from {} to {} for index {}: {}", + response.msg, + state->node_id, + to, + response_idx, + (answer == AppendEntriesResponseType::OK ? "ACK" : "NACK")); + #ifdef CCF_RAFT_TRACING nlohmann::json j = {}; j["function"] = "send_append_entries_response"; @@ -1564,8 +1600,9 @@ namespace aft // Ignore if we're not the leader. if (state->leadership_state != ccf::kv::LeadershipState::Leader) { - RAFT_FAIL_FMT( - "Recv append entries response to {} from {}: no longer leader", + RAFT_INFO_FMT( + "Recv {} to {} from {}: no longer leader", + r.msg, state->node_id, from); return; @@ -1578,8 +1615,9 @@ namespace aft { // We are behind, update our state. RAFT_DEBUG_FMT( - "Recv append entries response to {} from {}: more recent term ({} " + "Recv {} to {} from {}: more recent term ({} " "> {})", + r.msg, state->node_id, from, r.term, @@ -1597,7 +1635,8 @@ namespace aft if (r.success == AppendEntriesResponseType::OK) { RAFT_DEBUG_FMT( - "Recv append entries response to {} from {}: stale term ({} != {})", + "Recv {} to {} from {}: stale term ({} != {})", + r.msg, state->node_id, from, r.term, @@ -1615,9 +1654,7 @@ namespace aft if (r.success == AppendEntriesResponseType::OK) { RAFT_DEBUG_FMT( - "Recv append entries response to {} from {}: stale idx", - state->node_id, - from); + "Recv {} to {} from {}: stale idx", r.msg, state->node_id, from); return; } } @@ -1627,9 +1664,7 @@ namespace aft { // Failed due to log inconsistency. Reset sent_idx, and try again soon. RAFT_DEBUG_FMT( - "Recv append entries response to {} from {}: failed", - state->node_id, - from); + "Recv {} to {} from {}: failed", r.msg, state->node_id, from); const auto this_match = find_highest_possible_match({r.term, r.last_log_idx}); node->second.sent_idx = std::max( @@ -1645,28 +1680,48 @@ namespace aft } RAFT_DEBUG_FMT( - "Recv append entries response to {} from {} for index {}: success", + "Recv {} to {} from {} for index {}: success", + r.msg, state->node_id, from, r.last_log_idx); update_commit(); } + void send_request_pre_vote(const ccf::NodeId& to) + { + auto last_committable_idx = last_committable_index(); + CCF_ASSERT(last_committable_idx >= state->commit_idx, "lci < ci"); + + RequestPreVote rpv{ + .term = state->current_view, + .last_committable_idx = last_committable_idx, + .term_of_last_committable_idx = + get_term_internal(last_committable_idx)}; + +#ifdef CCF_RAFT_TRACING + nlohmann::json j = {}; + j["function"] = "send_request_vote"; + j["packet"] = rpv; + j["state"] = *state; + COMMITTABLE_INDICES(j["state"], state); + j["to_node_id"] = to; + RAFT_TRACE_JSON_OUT(j); +#endif + + channels->send_authenticated(to, ccf::NodeMsgType::consensus_msg, rpv); + } + void send_request_vote(const ccf::NodeId& to) { auto last_committable_idx = last_committable_index(); - RAFT_INFO_FMT( - "Send request vote from {} to {} at {}", - state->node_id, - to, - last_committable_idx); CCF_ASSERT(last_committable_idx >= state->commit_idx, "lci < ci"); RequestVote rv{ .term = state->current_view, .last_committable_idx = last_committable_idx, - .term_of_last_committable_idx = get_term_internal(last_committable_idx), - }; + .term_of_last_committable_idx = + get_term_internal(last_committable_idx)}; #ifdef CCF_RAFT_TRACING nlohmann::json j = {}; @@ -1681,10 +1736,9 @@ namespace aft channels->send_authenticated(to, ccf::NodeMsgType::consensus_msg, rv); } - void recv_request_vote(const ccf::NodeId& from, RequestVote r) + void recv_request_vote_unsafe( + const ccf::NodeId& from, RequestVote r, ElectionType election_type) { - std::lock_guard guard(state->lock); - // Do not check that from is a known node. It is possible to receive // RequestVotes from nodes that this node doesn't yet know, just as it // receives AppendEntries from those nodes. These should be obeyed just @@ -1692,62 +1746,63 @@ namespace aft // produce a primary in the new term, who will then help this node catch // up. -#ifdef CCF_RAFT_TRACING - nlohmann::json j = {}; - j["function"] = "recv_request_vote"; - j["packet"] = r; - j["state"] = *state; - COMMITTABLE_INDICES(j["state"], state); - j["from_node_id"] = from; - RAFT_TRACE_JSON_OUT(j); -#endif - if (state->current_view > r.term) { // Reply false, since our term is later than the received term. RAFT_DEBUG_FMT( - "Recv request vote to {} from {}: our term is later ({} > {})", + "Recv {} to {} from {}: our term is later ({} > {})", + r.msg, state->node_id, from, state->current_view, r.term); - send_request_vote_response(from, false); + send_request_vote_response(from, false, election_type); return; } - else if (state->current_view < r.term) + if (state->current_view < r.term) { RAFT_DEBUG_FMT( - "Recv request vote to {} from {}: their term is later ({} < {})", + "Recv {} to {} from {}: their term is later ({} < {})", + r.msg, state->node_id, from, state->current_view, r.term); + + // Even if ElectionType::PreVote, we should still update the term. + // A pre-vote-candidate does not update its term until it becomes a + // candidate. So a pre-vote request from a higher term indicates that we + // should catch up to the term that had a candidate in it. become_aware_of_new_term(r.term); } - if (leader_id.has_value()) + bool grant_vote = true; + + if ((election_type == ElectionType::RegularVote) && leader_id.has_value()) { // Reply false, since we already know the leader in the current term. RAFT_DEBUG_FMT( - "Recv request vote to {} from {}: leader {} already known in term {}", + "Recv {} to {} from {}: leader {} already known in term {}", + r.msg, state->node_id, from, leader_id.value(), state->current_view); - send_request_vote_response(from, false); - return; + grant_vote = false; } - if ((voted_for.has_value()) && (voted_for.value() != from)) + auto voted_for_other = + (voted_for.has_value()) && (voted_for.value() != from); + if ((election_type == ElectionType::RegularVote) && voted_for_other) { // Reply false, since we already voted for someone else. RAFT_DEBUG_FMT( - "Recv request vote to {} from {}: already voted for {}", + "Recv {} to {} from {}: already voted for {}", + r.msg, state->node_id, from, voted_for.value()); - send_request_vote_response(from, false); - return; + grant_vote = false; } // If the candidate's committable log is at least as up-to-date as ours, @@ -1756,56 +1811,72 @@ namespace aft const auto last_committable_idx = last_committable_index(); const auto term_of_last_committable_idx = get_term_internal(last_committable_idx); - - const auto answer = + const auto log_up_to_date = (r.term_of_last_committable_idx > term_of_last_committable_idx) || ((r.term_of_last_committable_idx == term_of_last_committable_idx) && (r.last_committable_idx >= last_committable_idx)); - - if (answer) + if (!log_up_to_date) { - // If we grant our vote, we also acknowledge that an election is in - // progress. - restart_election_timeout(); - leader_id.reset(); - voted_for = from; - } - else - { - RAFT_INFO_FMT( - "Voting against candidate at {}.{} because local state is at {}.{}", + RAFT_DEBUG_FMT( + "Recv {} to {} from {}: candidate log {}.{} is not up-to-date " + "with ours {}.{}", + r.msg, + state->node_id, + from, r.term_of_last_committable_idx, r.last_committable_idx, term_of_last_committable_idx, last_committable_idx); + grant_vote = false; } - send_request_vote_response(from, answer); - } + if (grant_vote && election_type == ElectionType::RegularVote) + { + // If we grant our vote to a candidate, then an election is in progress + restart_election_timeout(); + leader_id.reset(); + voted_for = from; + } - void send_request_vote_response(const ccf::NodeId& to, bool answer) - { RAFT_INFO_FMT( - "Send request vote response from {} to {}: {}", + "Recv {} to {} from {}: {} vote to candidate at {}.{} with " + "local state at {}.{}", + r.msg, state->node_id, - to, - answer); + from, + grant_vote ? "granted" : "denied", + r.term_of_last_committable_idx, + r.last_committable_idx, + term_of_last_committable_idx, + last_committable_idx); - RequestVoteResponse response{ - .term = state->current_view, .vote_granted = answer}; + send_request_vote_response(from, grant_vote, election_type); + } - channels->send_authenticated( - to, ccf::NodeMsgType::consensus_msg, response); + void recv_request_vote(const ccf::NodeId& from, RequestVote r) + { + std::lock_guard guard(state->lock); + +#ifdef CCF_RAFT_TRACING + nlohmann::json j = {}; + j["function"] = "recv_request_vote"; + j["packet"] = r; + j["state"] = *state; + COMMITTABLE_INDICES(j["state"], state); + j["from_node_id"] = from; + RAFT_TRACE_JSON_OUT(j); +#endif + + recv_request_vote_unsafe(from, r, ElectionType::RegularVote); } - void recv_request_vote_response( - const ccf::NodeId& from, RequestVoteResponse r) + void recv_request_pre_vote(const ccf::NodeId& from, RequestPreVote r) { std::lock_guard guard(state->lock); #ifdef CCF_RAFT_TRACING nlohmann::json j = {}; - j["function"] = "recv_request_vote_response"; + j["function"] = "recv_request_vote"; j["packet"] = r; j["state"] = *state; COMMITTABLE_INDICES(j["state"], state); @@ -1813,31 +1884,84 @@ namespace aft RAFT_TRACE_JSON_OUT(j); #endif - if (state->leadership_state != ccf::kv::LeadershipState::Candidate) + // A pre-vote is a speculative request vote, so we translate it back to a + // RequestVote to avoid duplicating the logic. + RequestVote rv{ + .term = r.term, + .last_committable_idx = r.last_committable_idx, + .term_of_last_committable_idx = r.term_of_last_committable_idx, + }; + rv.msg = RaftMsgType::raft_request_pre_vote; + recv_request_vote_unsafe(from, rv, ElectionType::PreVote); + } + + void send_request_vote_response( + const ccf::NodeId& to, bool answer, ElectionType election_type) + { + if (election_type == ElectionType::RegularVote) { + RequestVoteResponse response{ + .term = state->current_view, .vote_granted = answer}; + RAFT_INFO_FMT( - "Recv request vote response to {} from: {}: we aren't a candidate", + "Send {} from {} to {}: {}", + response.msg, state->node_id, - from); - return; + to, + answer); + + channels->send_authenticated( + to, ccf::NodeMsgType::consensus_msg, response); + } + else + { + RequestPreVoteResponse response{ + .term = state->current_view, .vote_granted = answer}; + + RAFT_INFO_FMT( + "Send {} from {} to {}: {}", + response.msg, + state->node_id, + to, + answer); + + channels->send_authenticated( + to, ccf::NodeMsgType::consensus_msg, response); } + } + + void recv_request_vote_response( + const ccf::NodeId& from, + RequestVoteResponse r, + ElectionType election_type) + { + std::lock_guard guard(state->lock); + +#ifdef CCF_RAFT_TRACING + nlohmann::json j = {}; + j["function"] = "recv_request_vote_response"; + j["packet"] = r; + j["state"] = *state; + COMMITTABLE_INDICES(j["state"], state); + j["from_node_id"] = from; + RAFT_TRACE_JSON_OUT(j); +#endif // Ignore if we don't recognise the node. auto node = all_other_nodes.find(from); if (node == all_other_nodes.end()) { RAFT_INFO_FMT( - "Recv request vote response to {} from {}: unknown node", - state->node_id, - from); + "Recv {} to {} from {}: unknown node", r.msg, state->node_id, from); return; } if (state->current_view < r.term) { RAFT_INFO_FMT( - "Recv request vote response to {} from {}: their term is more recent " + "Recv {} to {} from {}: their term is more recent " "({} < {})", + r.msg, state->node_id, from, state->current_view, @@ -1856,7 +1980,53 @@ namespace aft r.term); return; } - else if (!r.vote_granted) + + if ( + state->leadership_state != ccf::kv::LeadershipState::PreVoteCandidate && + state->leadership_state != ccf::kv::LeadershipState::Candidate) + { + RAFT_INFO_FMT( + "Recv {} to {} from: {}: we aren't a candidate", + r.msg, + state->node_id, + from); + return; + } + else if ( + election_type == ElectionType::RegularVote && + state->leadership_state != ccf::kv::LeadershipState::Candidate) + { + // Stale message from previous candidacy + // Candidate(T) -> Follower(T) -> PreVoteCandidate(T) + RAFT_INFO_FMT( + "Recv {} to {} from {}: no longer a candidate in {}", + r.msg, + state->node_id, + from, + r.term); + return; + } + else if ( + election_type == ElectionType::PreVote && + state->leadership_state != ccf::kv::LeadershipState::PreVoteCandidate) + { + // To receive a PreVoteResponse, we must have been a PreVoteCandidate in + // that term. + // Since we are a Candidate for term T, we can only have transitioned + // from PreVoteCandidate for term (T-1). Since terms are monotonic this + // is impossible. + RAFT_FAIL_FMT( + "Recv {} to {} from {}: unexpected message in {} when " + "Candidate for {}", + r.msg, + state->node_id, + from, + r.term, + state->current_view); + return; + } + + if (!r.vote_granted) { // Do nothing. RAFT_INFO_FMT( @@ -1874,6 +2044,20 @@ namespace aft add_vote_for_me(from); } + void recv_request_vote_response( + const ccf::NodeId& from, RequestVoteResponse r) + { + recv_request_vote_response(from, r, ElectionType::RegularVote); + } + + void recv_request_pre_vote_response( + const ccf::NodeId& from, RequestPreVoteResponse r) + { + RequestVoteResponse rvr{.term = r.term, .vote_granted = r.vote_granted}; + rvr.msg = RaftMsgType::raft_request_pre_vote_response; + recv_request_vote_response(from, rvr, ElectionType::PreVote); + } + void recv_propose_request_vote( const ccf::NodeId& from, ProposeRequestVote r) { @@ -1917,6 +2101,47 @@ namespace aft } } + void become_pre_vote_candidate() + { + if (configurations.empty()) + { + LOG_INFO_FMT( + "Not becoming pre-vote candidate {} due to lack of a configuration.", + state->node_id); + return; + } + + state->leadership_state = ccf::kv::LeadershipState::PreVoteCandidate; + leader_id.reset(); + + reset_votes_for_me(); + restart_election_timeout(); + + RAFT_INFO_FMT( + "Becoming pre-vote candidate {}: {}", + state->node_id, + state->current_view); + +#ifdef CCF_RAFT_TRACING + nlohmann::json j = {}; + j["function"] = "become_pre_vote_candidate"; + j["state"] = *state; + COMMITTABLE_INDICES(j["state"], state); + j["configurations"] = configurations; + RAFT_TRACE_JSON_OUT(j); +#endif + + add_vote_for_me(state->node_id); + + // Request votes only go to nodes in configurations, since only + // their votes can be tallied towards an election quorum. + for (auto const& node_id : other_nodes_in_active_configs()) + { + // ccfraft!RequestVote + send_request_pre_vote(node_id); + } + } + // ccfraft!Timeout void become_candidate() { @@ -2198,7 +2423,7 @@ namespace aft votes_for_me[conf.idx].quorum); } - // We need a quorum of votes in _all_ configurations to become leader + // We need a quorum of votes in _all_ configurations bool is_elected = true; for (auto const& v : votes_for_me) { @@ -2214,7 +2439,19 @@ namespace aft if (is_elected) { - become_leader(); + switch (state->leadership_state) + { + case ccf::kv::LeadershipState::PreVoteCandidate: + become_candidate(); + break; + case ccf::kv::LeadershipState::Candidate: + become_leader(); + break; + default: + throw std::logic_error( + "add_vote_for_me() called while not a pre-vote candidate or " + "candidate"); + } } } diff --git a/src/consensus/aft/raft_types.h b/src/consensus/aft/raft_types.h index b62fb8d833b0..803d24ba09c8 100644 --- a/src/consensus/aft/raft_types.h +++ b/src/consensus/aft/raft_types.h @@ -102,6 +102,8 @@ namespace aft raft_request_vote, raft_request_vote_response, raft_propose_request_vote, + raft_request_pre_vote, + raft_request_pre_vote_response, }; DECLARE_JSON_ENUM( RaftMsgType, @@ -114,6 +116,9 @@ namespace aft {RaftMsgType::raft_request_vote, "raft_request_vote"}, {RaftMsgType::raft_request_vote_response, "raft_request_vote_response"}, {RaftMsgType::raft_propose_request_vote, "raft_propose_request_vote"}, + {RaftMsgType::raft_request_pre_vote, "raft_request_pre_vote"}, + {RaftMsgType::raft_request_pre_vote_response, + "raft_request_pre_vote_response"}, }); #pragma pack(push, 1) @@ -183,6 +188,16 @@ namespace aft DECLARE_JSON_REQUIRED_FIELDS( AppendEntriesResponse, term, last_log_idx, success); + enum ElectionType + { + PreVote = 0, + RegularVote = 1 + }; + DECLARE_JSON_ENUM( + ElectionType, + {{ElectionType::PreVote, "PreVote"}, + {ElectionType::RegularVote, "RegularVote"}}); + DECLARE_JSON_TYPE(RaftHeader) DECLARE_JSON_REQUIRED_FIELDS(RaftHeader, msg) struct RequestVote : RaftHeader @@ -195,6 +210,19 @@ namespace aft DECLARE_JSON_REQUIRED_FIELDS( RequestVote, term, last_committable_idx, term_of_last_committable_idx); + DECLARE_JSON_TYPE(RaftHeader); + DECLARE_JSON_REQUIRED_FIELDS(RaftHeader, msg); + struct RequestPreVote : RaftHeader + { + Term term; + Index last_committable_idx; + Term term_of_last_committable_idx; + }; + DECLARE_JSON_TYPE_WITH_BASE( + RequestPreVote, RaftHeader); + DECLARE_JSON_REQUIRED_FIELDS( + RequestPreVote, term, last_committable_idx, term_of_last_committable_idx); + DECLARE_JSON_TYPE(RaftHeader) DECLARE_JSON_REQUIRED_FIELDS(RaftHeader, msg) struct RequestVoteResponse : RaftHeader @@ -206,6 +234,17 @@ namespace aft RequestVoteResponse, RaftHeader); DECLARE_JSON_REQUIRED_FIELDS(RequestVoteResponse, term, vote_granted); + DECLARE_JSON_TYPE(RaftHeader) + DECLARE_JSON_REQUIRED_FIELDS(RaftHeader, msg) + struct RequestPreVoteResponse : RaftHeader + { + Term term; + bool vote_granted; + }; + DECLARE_JSON_TYPE_WITH_BASE( + RequestPreVoteResponse, RaftHeader); + DECLARE_JSON_REQUIRED_FIELDS(RequestPreVoteResponse, term, vote_granted); + DECLARE_JSON_TYPE(RaftHeader) DECLARE_JSON_REQUIRED_FIELDS(RaftHeader, msg) struct ProposeRequestVote : RaftHeader @@ -220,3 +259,59 @@ namespace aft #pragma pack(pop) } + +FMT_BEGIN_NAMESPACE +template <> +struct formatter +{ + template + constexpr auto parse(ParseContext& ctx) + { + return ctx.begin(); + } + + template + auto format(const aft::RaftMsgType& msg_type, FormatContext& ctx) const + -> decltype(ctx.out()) + { + switch (msg_type) + { + case (aft::RaftMsgType::raft_append_entries): + { + return fmt::format_to(ctx.out(), "append_entries"); + } + case (aft::RaftMsgType::raft_append_entries_response): + { + return fmt::format_to(ctx.out(), "append_entries_response"); + } + case (aft::RaftMsgType::raft_append_entries_signed_response): + { + return fmt::format_to(ctx.out(), "append_entries_signed_response"); + } + case (aft::RaftMsgType::raft_request_vote): + { + return fmt::format_to(ctx.out(), "request_vote"); + } + case (aft::RaftMsgType::raft_request_vote_response): + { + return fmt::format_to(ctx.out(), "request_vote_response"); + } + case (aft::RaftMsgType::raft_propose_request_vote): + { + return fmt::format_to(ctx.out(), "propose_request_vote"); + } + case (aft::RaftMsgType::raft_request_pre_vote): + { + return fmt::format_to(ctx.out(), "request_pre_vote"); + } + case (aft::RaftMsgType::raft_request_pre_vote_response): + { + return fmt::format_to(ctx.out(), "request_pre_vote_response"); + } + default: + throw std::runtime_error( + fmt::format("Unhandled RaftMsgType: {}", uint64_t(msg_type))); + } + } +}; +FMT_END_NAMESPACE diff --git a/src/consensus/aft/test/driver.cpp b/src/consensus/aft/test/driver.cpp index 1bae98be99a1..b7c8f3419448 100644 --- a/src/consensus/aft/test/driver.cpp +++ b/src/consensus/aft/test/driver.cpp @@ -92,6 +92,26 @@ int main(int argc, char** argv) switch (shash(in)) { + case shash("pre_vote_enabled"): + { + assert(items.size() == 2); + if (items[1] == "true") + { + driver->set_pre_vote_enabled(true); + } + else if (items[1] == "false") + { + driver->set_pre_vote_enabled(false); + } + else + { + throw std::runtime_error(fmt::format( + "pre_vote_enabled value must be true or false on line " + "{}", + lineno)); + } + break; + } case shash("start_node"): assert(items.size() == 2); driver->create_start_node(items[1], lineno); @@ -275,6 +295,15 @@ int main(int argc, char** argv) assert(items.size() == 4); driver->assert_detail(items[1], items[2], items[3], false, lineno); break; + case shash("assert_config"): + assert(items.size() >= 3); + driver->assert_config( + items[1], items[2], {std::next(items.begin(), 3), items.end()}); + break; + case shash("assert_absent_config"): + assert(items.size() == 3); + driver->assert_absent_config(items[1], items[2]); + break; case shash("replicate_new_configuration"): assert(items.size() >= 3); items.erase(items.begin()); diff --git a/src/consensus/aft/test/driver.h b/src/consensus/aft/test/driver.h index 3eacddc14e7d..d54576293698 100644 --- a/src/consensus/aft/test/driver.h +++ b/src/consensus/aft/test/driver.h @@ -4,6 +4,7 @@ #include "ccf/ds/logger.h" #include "consensus/aft/raft.h" +#include "consensus/aft/raft_types.h" #include "logging_stub.h" #include @@ -116,6 +117,7 @@ class RaftDriver std::shared_ptr raft; }; + bool pre_vote_enabled = false; std::map _nodes; std::set> _connections; @@ -197,7 +199,7 @@ class RaftDriver std::make_unique(kv), std::make_unique(node_id), std::make_shared(), - std::make_shared(node_id), + std::make_shared(node_id, pre_vote_enabled), nullptr); kv->set_set_retired_committed_hook( [raft](aft::Index idx, const std::vector& node_ids) { @@ -389,6 +391,21 @@ class RaftDriver "{}--{}{}: {}", first, (dropped ? "X" : ">>"), second, message); } + void log_msg_details( + ccf::NodeId node_id, + ccf::NodeId tgt_node_id, + aft::RequestPreVote rv, + bool dropped) + { + const auto s = fmt::format( + "{} for term {}, at tx {}.{}", + rv.msg, + rv.term, + rv.term_of_last_committable_idx, + rv.last_committable_idx); + log(node_id, tgt_node_id, s, dropped); + } + void log_msg_details( ccf::NodeId node_id, ccf::NodeId tgt_node_id, @@ -396,13 +413,25 @@ class RaftDriver bool dropped) { const auto s = fmt::format( - "request_vote for term {}, at tx {}.{}", + "{} for term {}, at tx {}.{}", + rv.msg, rv.term, rv.term_of_last_committable_idx, rv.last_committable_idx); log(node_id, tgt_node_id, s, dropped); } + void log_msg_details( + ccf::NodeId node_id, + ccf::NodeId tgt_node_id, + aft::RequestPreVoteResponse rv, + bool dropped) + { + const auto s = fmt::format( + "{} for term {} = {}", rv.msg, rv.term, (rv.vote_granted ? "Y" : "N")); + rlog(node_id, tgt_node_id, s, dropped); + } + void log_msg_details( ccf::NodeId node_id, ccf::NodeId tgt_node_id, @@ -410,9 +439,7 @@ class RaftDriver bool dropped) { const auto s = fmt::format( - "request_vote_response for term {} = {}", - rv.term, - (rv.vote_granted ? "Y" : "N")); + "{} for term {} = {}", rv.msg, rv.term, (rv.vote_granted ? "Y" : "N")); rlog(node_id, tgt_node_id, s, dropped); } @@ -423,7 +450,8 @@ class RaftDriver bool dropped) { const auto s = fmt::format( - "append_entries ({}.{}, {}.{}] (term {}, commit {})", + "{} ({}.{}, {}.{}] (term {}, commit {})", + ae.msg, ae.prev_term, ae.prev_idx, ae.term_of_idx, @@ -454,10 +482,7 @@ class RaftDriver } } const auto s = fmt::format( - "append_entries_response {} for {}.{}", - success, - aer.term, - aer.last_log_idx); + "{} {} for {}.{}", aer.msg, success, aer.term, aer.last_log_idx); rlog(node_id, tgt_node_id, s, dropped); } @@ -467,7 +492,7 @@ class RaftDriver aft::ProposeRequestVote prv, bool dropped) { - const auto s = fmt::format("propose_request_vote for term {}", prv.term); + const auto s = fmt::format("{} for term {}", prv.msg, prv.term); log(node_id, tgt_node_id, s, dropped); } @@ -492,7 +517,21 @@ class RaftDriver log_msg_details(node_id, tgt_node_id, rv, dropped); break; } + case (aft::RaftMsgType::raft_request_pre_vote): + { + auto rpv = *(aft::RequestPreVote*)data; + packet = rpv; + log_msg_details(node_id, tgt_node_id, rpv, dropped); + break; + } case (aft::RaftMsgType::raft_request_vote_response): + { + auto rvr = *(aft::RequestPreVoteResponse*)data; + packet = rvr; + log_msg_details(node_id, tgt_node_id, rvr, dropped); + break; + } + case (aft::RaftMsgType::raft_request_pre_vote_response): { auto rvr = *(aft::RequestVoteResponse*)data; packet = rvr; @@ -997,6 +1036,11 @@ class RaftDriver } } + void set_pre_vote_enabled(bool enabled) + { + pre_vote_enabled = enabled; + } + using Discrepancies = std::map>; Discrepancies check_state_sync(const std::map nodes) @@ -1280,6 +1324,15 @@ class RaftDriver } } + std::string detail_to_string(const nlohmann::json& detail) + { + if (detail.is_string()) + { + return detail.get(); + } + return detail.dump(); + } + void assert_detail( ccf::NodeId node_id, const std::string& detail, @@ -1300,7 +1353,7 @@ class RaftDriver std::to_string((int)lineno))); } - std::string value = d[detail]; + std::string value = detail_to_string(d[detail]); if (equal ? (value != expected) : (value == expected)) { std::string cmp = equal ? "!" : "="; @@ -1321,4 +1374,58 @@ class RaftDriver std::to_string((int)lineno))); } } + + void assert_config( + const std::string& node_id, + const std::string& config_idx_s, + const std::vector& node_ids) + { + auto config_idx = std::stol(config_idx_s); + auto details = _nodes.at(node_id).raft->get_details(); + for (const auto& config : details.configs) + { + if (config.idx == config_idx) + { + std::set expected_nodes(node_ids.begin(), node_ids.end()); + std::set actual_nodes; + for (const auto& [id, _] : config.nodes) + { + actual_nodes.insert(id); + } + if (expected_nodes != actual_nodes) + { + auto actual_str = + fmt::format("{{{}}}", fmt::join(actual_nodes, ", ")); + auto expected_str = + fmt::format("{{{}}}", fmt::join(expected_nodes, ", ")); + throw std::runtime_error(fmt::format( + "Node {} configuration at idx {} ({}) does not match expected ({})", + node_id, + config_idx, + actual_str, + expected_str)); + } + return; + } + } + throw std::runtime_error(fmt::format( + "Node {} does not have a configuration at idx {}", node_id, config_idx)); + } + + void assert_absent_config( + const std::string& node_id, const std::string& config_idx_s) + { + auto config_idx = std::stol(config_idx_s); + auto details = _nodes.at(node_id).raft->get_details(); + for (const auto& config : details.configs) + { + if (config.idx == config_idx) + { + throw std::runtime_error(fmt::format( + "Node {} has unexpected configuration at idx {}", + node_id, + config_idx)); + } + } + } }; \ No newline at end of file diff --git a/src/kv/kv_types.h b/src/kv/kv_types.h index a6a69dd0a9da..86f57ba50ba9 100644 --- a/src/kv/kv_types.h +++ b/src/kv/kv_types.h @@ -140,6 +140,7 @@ namespace ccf::kv None, Leader, Follower, + PreVoteCandidate, Candidate, }; @@ -148,6 +149,7 @@ namespace ccf::kv {{LeadershipState::None, "None"}, {LeadershipState::Leader, "Leader"}, {LeadershipState::Follower, "Follower"}, + {LeadershipState::PreVoteCandidate, "PreVoteCandidate"}, {LeadershipState::Candidate, "Candidate"}}); enum class MembershipState diff --git a/src/node/rpc/node_frontend.h b/src/node/rpc/node_frontend.h index 3f5f10cfd5d6..9143a0bc5f8b 100644 --- a/src/node/rpc/node_frontend.h +++ b/src/node/rpc/node_frontend.h @@ -424,7 +424,7 @@ namespace ccf openapi_info.description = "This API provides public, uncredentialed access to service and node " "state."; - openapi_info.document_version = "4.13.0"; + openapi_info.document_version = "4.14.0"; } void init_handlers() override diff --git a/tests/raft_scenarios/check_quorum_012_0 b/tests/raft_scenarios/check_quorum_012_0 new file mode 100644 index 000000000000..8388d00945ab --- /dev/null +++ b/tests/raft_scenarios/check_quorum_012_0 @@ -0,0 +1,48 @@ +start_node,0 +emit_signature,2 +swap_nodes,2,in,1,2 +emit_signature,2 + +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all + +# All nodes believe that the only configuration is {0,1,2} +assert_state_sync +assert_absent_config,0,1 +assert_config,0,3,0,1,2 + +disconnect_node,0 +swap_nodes,2,out,1,2 +emit_signature,2 + +# Only node 0 knows about config {0}, as it is partitioned from nodes 1 and 2 +assert_config,0,3,0,1,2 +assert_config,0,5,0 + +assert_config,1,3,0,1,2 +assert_absent_config,1,5 +assert_config,2,3,0,1,2 +assert_absent_config,2,5 + +# Node 0 should step down via CheckQuorum +periodic_one,0,100 +assert_detail,0,leadership_state,Follower + +periodic_one,1,100 +assert_detail,1,leadership_state,Candidate + +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all + +assert_detail,0,leadership_state,Follower +assert_detail,1,leadership_state,Leader \ No newline at end of file diff --git a/tests/raft_scenarios/check_quorum_0_012 b/tests/raft_scenarios/check_quorum_0_012 new file mode 100644 index 000000000000..04b8e1ba6043 --- /dev/null +++ b/tests/raft_scenarios/check_quorum_0_012 @@ -0,0 +1,48 @@ +start_node,0 +emit_signature,2 +swap_nodes,2,in,1,2 +emit_signature,2 + +periodic_one,0,10 +dispatch_all +dispatch_all +periodic_one,0,10 +dispatch_all + + +# All nodes have both configs: {0} and {0,1,2} +assert_commit_idx,0,4 +assert_absent_config,0,1 +assert_config,0,3,0,1,2 + +assert_commit_idx,1,2 +assert_config,1,1,0 +assert_config,1,3,0,1,2 + +assert_commit_idx,2,2 +assert_config,2,1,0 +assert_config,2,3,0,1,2 + +# Node 0 should step down via CheckQuorum +disconnect_node,0 +periodic_one,0,100 +assert_detail,0,leadership_state,Follower + +# Node 1 should try to start an election but fails without a quorum in {0} +periodic_one,1,100 +assert_detail,1,leadership_state,Candidate + +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all + +assert_detail,1,leadership_state,Candidate + +# Fixup the network to allow a leader to be elected and the test to pass +reconnect_node,0 +periodic_all,100 +dispatch_all +periodic_all,10 +dispatch_all \ No newline at end of file diff --git a/tests/raft_scenarios/pre_vote b/tests/raft_scenarios/pre_vote new file mode 100644 index 000000000000..45058f8dc5c2 --- /dev/null +++ b/tests/raft_scenarios/pre_vote @@ -0,0 +1,94 @@ +pre_vote_enabled,true +start_node,0 +emit_signature,2 +swap_nodes,2,in,1,2 +emit_signature,2 + +periodic_one,0,10 +dispatch_all +periodic_one,0,10 +dispatch_all +periodic_one,0,10 +dispatch_all + +# All nodes just have config {0,1,2} +assert_commit_idx,0,4 +assert_absent_config,0,1 +assert_config,0,3,0,1,2 + +assert_commit_idx,1,4 +assert_absent_config,1,1 +assert_config,1,3,0,1,2 + +assert_commit_idx,2,4 +assert_absent_config,2,1 +assert_config,2,3,0,1,2 + +# Node 1 steps up as a pre-vote candidate +periodic_one,1,100 +assert_detail,1,leadership_state,PreVoteCandidate + +# Node 2 responds to pre-vote request, +# but does not update its term +dispatch_single,1,2 +assert_detail,2,leadership_state,Follower +assert_detail,2,current_view,2 + +# Node 1 becomes candidate and then leader for term 3 +dispatch_single,2,1 +assert_detail,1,leadership_state,Candidate +assert_detail,1,current_view,3 +dispatch_single,1,2 +dispatch_single,2,1 +assert_detail,1,leadership_state,Leader + +# Node 0 is unable to become a candidate +# Node 0 steps down via CheckQuorum +periodic_one,0,100 +assert_detail,0,leadership_state,Follower +periodic_one,0,100 +assert_detail,0,leadership_state,PreVoteCandidate +assert_detail,0,current_view,2 + +# Upon receiving AppendEntries from Node 1, +# Node 0 becomes a follower in view 3 +dispatch_single,1,0 +dispatch_single,1,0 +assert_detail,0,current_view,3 +assert_detail,0,leadership_state,Follower + +dispatch_all +periodic_one,0,10 +dispatch_all + +assert_commit_idx,0,4 +assert_detail,0,current_view,3 +assert_commit_idx,1,4 +assert_detail,1,current_view,3 +assert_commit_idx,2,4 +assert_detail,2,current_view,3 + +# PreVotes can be denied if they are from an older log +emit_signature,3 +periodic_one,1,10 +dispatch_single,1,2 + +# Node 0 becomes a prevote candidate but cannot be elected +periodic_one,0,100 +assert_detail,0,leadership_state,PreVoteCandidate +assert_detail,0,current_view,3 +summarise_log,0 +# n[0]: 2.1, 2.2, 2.3, [2.4] (ts=351) +assert_detail,2,leadership_state,Follower +summarise_log,2 +# n[2]: 2.1, 2.2, 2.3, [2.4], 3.5 (ts=351) + +dispatch_single,0,2 +dispatch_single,2,0 + +# Node 0 is still a PreVoteCandidate as Node 2 denied its pre-vote request +assert_detail,0,leadership_state,PreVoteCandidate + +# Node 0 will fall back to a follower when it receives an append entries +dispatch_single,1,0 +assert_detail,0,leadership_state,Follower \ No newline at end of file diff --git a/tests/raft_scenarios/pre_vote_split b/tests/raft_scenarios/pre_vote_split new file mode 100644 index 000000000000..644dddaa1af6 --- /dev/null +++ b/tests/raft_scenarios/pre_vote_split @@ -0,0 +1,59 @@ +pre_vote_enabled,false +start_node,0 +emit_signature,2 +swap_nodes,2,in,1 +emit_signature,2 + +pre_vote_enabled,true +swap_nodes,2,in,2 +emit_signature,2 + +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all +periodic_all,10 +dispatch_all + +assert_commit_idx,0,6 +assert_config,0,5,0,1,2 +assert_detail,0,leadership_state,Leader +assert_commit_idx,1,6 +assert_config,1,5,0,1,2 +assert_detail,1,leadership_state,Follower +assert_commit_idx,2,6 +assert_config,2,5,0,1,2 +assert_detail,2,leadership_state,Follower + + +# Node 2 (Pre-Vote enabled) can be elected +disconnect_node,0 +periodic_one,2,100 +assert_detail,2,leadership_state,PreVoteCandidate +assert_detail,2,current_view,2 + +dispatch_single,2,1 +dispatch_single,1,2 + +assert_detail,2,leadership_state,Candidate +assert_detail,2,current_view,3 + +dispatch_single,2,1 +dispatch_single,1,2 + +assert_detail,2,leadership_state,Leader + +loop_until_sync + +# Node 1 (Pre-Vote disabled) can be elected + +disconnect_node,0 +periodic_one,1,100 +assert_detail,1,leadership_state,Candidate +assert_detail,1,current_view,4 + +dispatch_single,1,2 +dispatch_single,2,1 + +assert_detail,1,leadership_state,Leader +assert_detail,1,current_view,4 \ No newline at end of file diff --git a/tests/raft_scenarios_runner.py b/tests/raft_scenarios_runner.py index 3208024efd20..12ed7493b776 100644 --- a/tests/raft_scenarios_runner.py +++ b/tests/raft_scenarios_runner.py @@ -125,6 +125,13 @@ def expand_files(files): parser.add_argument("driver", type=str, help="Path to raft_driver binary") parser.add_argument("--gen-scenarios", action="store_true") parser.add_argument("files", nargs="*", type=str, help="Path to scenario files") + parser.add_argument( + "-o", + "--output", + type=str, + help="Output directory", + default=os.path.join("consensus"), + ) args = parser.parse_args() @@ -139,8 +146,7 @@ def expand_files(files): ostream = sys.stdout # Create consensus-specific output directory - output_dir = os.path.join("consensus") - os.makedirs(output_dir, exist_ok=True) + os.makedirs(args.output, exist_ok=True) for scenario in files: ostream.write("## {}\n\n".format(os.path.basename(scenario))) @@ -172,7 +178,7 @@ def expand_files(files): ## Do not create an empty ndjson file if log is emtpy. if log: with open( - os.path.join(output_dir, f"{os.path.basename(scenario)}.ndjson"), + os.path.join(args.output, f"{os.path.basename(scenario)}.ndjson"), "w", encoding="utf-8", ) as f: diff --git a/tla/consensus/MCAliases.tla b/tla/consensus/MCAliases.tla index ba5769047119..8e9ac676e5fa 100644 --- a/tla/consensus/MCAliases.tla +++ b/tla/consensus/MCAliases.tla @@ -64,6 +64,7 @@ DebugAliasAggregates == DebugAliasVars == [ + preVoteStatus |-> preVoteStatus, configurations |-> configurations, messages |-> messages, currentTerm |-> currentTerm, @@ -72,7 +73,7 @@ DebugAliasVars == votedFor |-> votedFor, hasJoined |-> hasJoined, \* More compact visualization of the log. -\* lg |-> [ s \in Servers |-> StringifyLog(s) ], + \* lg |-> [ s \in Servers |-> StringifyLog(s) ], log |-> log, commitIndex |-> commitIndex, votesGranted |-> votesGranted, @@ -83,10 +84,10 @@ DebugAliasVars == ] DebugAlias == - [ _format |-> B[srv] \o "/\\ %1$s = %2$s\n" ] - @@ - DebugAliasAggregates - @@ + \*[ _format |-> B[srv] \o "/\\ %1$s = %2$s\n" ] + \* @@ + \*DebugAliasAggregates + \* @@ DebugAliasGlobals @@ DebugAliasVars diff --git a/tla/consensus/MCccfraft.cfg b/tla/consensus/MCccfraft.cfg index 0c5e7093a9b3..f4fe83056c1b 100644 --- a/tla/consensus/MCccfraft.cfg +++ b/tla/consensus/MCccfraft.cfg @@ -15,10 +15,15 @@ CONSTANTS Nil = Nil Follower = L_Follower + PreVoteCandidate = L_PreVoteCandidate Candidate = L_Candidate Leader = L_Leader None = L_None + PreVoteDisabled = PV_PreVoteDisabled + PreVoteCapable = PV_PreVoteCapable + PreVoteEnabled = PV_PreVoteEnabled + Active = R_Active RetirementOrdered = R_RetirementOrdered RetirementSigned = R_RetirementSigned diff --git a/tla/consensus/MCccfraft.tla b/tla/consensus/MCccfraft.tla index 436cdf23aadf..7aa089fae3fa 100644 --- a/tla/consensus/MCccfraft.tla +++ b/tla/consensus/MCccfraft.tla @@ -106,10 +106,17 @@ MCSend(msg) == /\ n.type = AppendEntriesResponse /\ CCF!Send(msg) +MCInitPreVoteStatus == + /\ PreVoteStatusTypeInv + /\ ~\E i, j \in Servers: + /\ PreVoteDisabled \in preVoteStatus[i] + /\ PreVoteEnabled \in preVoteStatus[j] + MCInit == /\ InitMessagesVars /\ InitCandidateVars /\ InitLeaderVars + /\ MCInitPreVoteStatus /\ IF Cardinality(Configurations[1]) = 1 \* If the first config is just one node, we can start with a two-tx log and a single config. THEN InitLogConfigServerVars(Configurations[1], StartLog) diff --git a/tla/consensus/SIMccfraft.cfg b/tla/consensus/SIMccfraft.cfg index 26d2b1f51e06..4a55e1f4e3b6 100644 --- a/tla/consensus/SIMccfraft.cfg +++ b/tla/consensus/SIMccfraft.cfg @@ -6,10 +6,15 @@ CONSTANTS Nil = Nil Follower = L_Follower + PreVoteCandidate = L_PreVoteCandidate Candidate = L_Candidate Leader = L_Leader None = L_None + PreVoteDisabled = PV_PreVoteDisabled + PreVoteCapable = PV_PreVoteCapable + PreVoteEnabled = PV_PreVoteEnabled + Active = R_Active RetirementOrdered = R_RetirementOrdered RetirementSigned = R_RetirementSigned @@ -51,6 +56,8 @@ CONSTANTS Extend <- [abs]ABSExtend CopyMaxAndExtend <- [abs]ABSCopyMaxAndExtend + InitPreVoteStatus <- SIMInitPreVoteStatus + _PERIODIC Periodically @@ -71,6 +78,8 @@ PROPERTIES LeaderProp LogMatchingProp + VotesGrantedMonotonicProp + \* ALIAS \* \* DebugAlias \* \* DebugActingServerAlias @@ -115,4 +124,5 @@ INVARIANTS \* DebugInvAllMessagesProcessable \* DebugInvRetirementReachable \* DebugInvUpToDepth - \* DebugMoreUpToDateCorrectInv \ No newline at end of file + \* DebugMoreUpToDateCorrectInv + \* DebugNonTrivialLeaderInv \ No newline at end of file diff --git a/tla/consensus/SIMccfraft.tla b/tla/consensus/SIMccfraft.tla index d9c234e89cf6..229f6aa3cce8 100644 --- a/tla/consensus/SIMccfraft.tla +++ b/tla/consensus/SIMccfraft.tla @@ -16,6 +16,12 @@ SIMInitReconfigurationVars == \* Start with any subset of servers in the active configuration. \/ CCF!InitReconfigurationVars +SIMInitPreVoteStatus == + /\ PreVoteStatusTypeInv + /\ ~\E i, j \in Servers: + /\ PreVoteDisabled \in preVoteStatus[i] + /\ PreVoteEnabled \in preVoteStatus[j] + LOCAL R == 1..IF "R" \in DOMAIN IOEnv THEN atoi(IOEnv.R) ELSE 10 @@ -66,6 +72,7 @@ SIMFairness == /\ \A s \in Servers : WF_vars(SignCommittableMessages(s)) /\ \A s \in Servers : WF_vars(AdvanceCommitIndex(s)) /\ \A s \in Servers : WF_vars(AppendRetiredCommitted(s)) + /\ \A s \in Servers : WF_vars(PreVoteEnabled \in preVoteStatus[s] /\ BecomeCandidate(s)) /\ \A s \in Servers : WF_vars(BecomeLeader(s)) \* The following fairness conditions reference the original CCF actions \* and, thus, do not include the RandomElement conjunct. diff --git a/tla/consensus/Traceccfraft.cfg b/tla/consensus/Traceccfraft.cfg index 0abd26c8569d..0e7cac20ccf5 100644 --- a/tla/consensus/Traceccfraft.cfg +++ b/tla/consensus/Traceccfraft.cfg @@ -54,13 +54,20 @@ CONSTANTS InitReconfigurationVars <- TraceInitReconfigurationVars + InitPreVoteStatus <- TraceInitPreVoteStatus + Nil = Nil Follower = L_Follower + PreVoteCandidate = L_PreVoteCandidate Candidate = L_Candidate Leader = L_Leader None = L_None + PreVoteDisabled = PV_PreVoteDisabled + PreVoteCapable = PV_PreVoteCapable + PreVoteEnabled = PV_PreVoteEnabled + Active = R_Active RetirementOrdered = R_RetirementOrdered RetirementSigned = R_RetirementSigned diff --git a/tla/consensus/Traceccfraft.tla b/tla/consensus/Traceccfraft.tla index 79c945aa75e1..a9dd2523ddd1 100644 --- a/tla/consensus/Traceccfraft.tla +++ b/tla/consensus/Traceccfraft.tla @@ -4,13 +4,15 @@ EXTENDS ccfraft, Json, IOUtils, Sequences, MCAliases \* raft_types.h enum RaftMsgType RaftMsgType == "raft_append_entries" :> AppendEntriesRequest @@ "raft_append_entries_response" :> AppendEntriesResponse @@ - "raft_request_vote" :> RequestVoteRequest @@ "raft_request_vote_response" :> RequestVoteResponse @@ + "raft_request_vote" :> RequestVoteRequest @@ "raft_request_pre_vote" :> RequestVoteRequest @@ + "raft_request_vote_response" :> RequestVoteResponse @@ "raft_request_pre_vote_response" :> RequestVoteResponse @@ "raft_propose_request_vote" :> ProposeVoteRequest ToLeadershipState == - "Leader" :> Leader @@ "Follower" :> Follower @@ + "PreVoteCandidate" :> PreVoteCandidate @@ "Candidate" :> Candidate @@ + "Leader" :> Leader @@ "None" :> None ToMembershipState == @@ -55,6 +57,9 @@ IsRequestVoteRequest(msg, dst, src, logline) == /\ IsHeader(msg, dst, src, logline, RequestVoteRequest) /\ msg.lastCommittableIndex = logline.msg.packet.last_committable_idx /\ msg.lastCommittableTerm = logline.msg.packet.term_of_last_committable_idx + /\ IF logline.msg.packet.msg = "raft_request_vote" + THEN msg.isPreVote = FALSE + ELSE msg.isPreVote = TRUE IsRequestVoteResponse(msg, dst, src, logline) == /\ IsHeader(msg, dst, src, logline, RequestVoteResponse) @@ -111,6 +116,8 @@ TraceAppendEntriesBatchsize(i, j) == TraceInitReconfigurationVars == /\ InitLogConfigServerVars({TraceLog[1].msg.state.node_id}, StartLog) +TraceInitPreVoteStatus == PreVoteStatusTypeInv + ------------------------------------------------------------------------------------- VARIABLE l, ts @@ -149,12 +156,26 @@ IsDropPendingTo == /\ IsEvent("drop_pending_to") /\ Network!DropMessage(logline.msg.to_node_id, LAMBDA msg: IsMessage(msg, logline.msg.to_node_id, logline.msg.from_node_id, logline)) - /\ UNCHANGED <> + /\ UNCHANGED <> IsTimeout == + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) + /\ \/ /\ IsEvent("become_pre_vote_candidate") + /\ logline.msg.state.leadership_state = "PreVoteCandidate" + \/ /\ IsEvent("become_candidate") + /\ logline.msg.state.leadership_state = "Candidate" + /\ Timeout(logline.msg.state.node_id) + /\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id) + /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx + /\ leadershipState'[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] + /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + +IsBecomeCandidate == /\ IsEvent("become_candidate") /\ logline.msg.state.leadership_state = "Candidate" - /\ Timeout(logline.msg.state.node_id) + /\ logline.msg.state.pre_vote_enabled /\ PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id] + /\ BecomeCandidate(logline.msg.state.node_id) /\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id) /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ leadershipState'[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] @@ -170,6 +191,7 @@ IsBecomeLeader == /\ leadershipState'[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsClientRequest == /\ IsEvent("replicate") @@ -181,6 +203,7 @@ IsClientRequest == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsCleanupNodes == /\ IsEvent("replicate") @@ -192,6 +215,7 @@ IsCleanupNodes == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsSendAppendEntries == /\ IsEvent("send_append_entries") @@ -211,6 +235,7 @@ IsSendAppendEntries == /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsRcvAppendEntriesRequest == /\ IsEvent("recv_append_entries") @@ -226,6 +251,7 @@ IsRcvAppendEntriesRequest == /\ IsAppendEntriesRequest(m, i, j, logline) /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsSendAppendEntriesResponse == \* Skip saer because ccfraft!HandleAppendEntriesRequest atomcially handles the request and sends the response. @@ -237,12 +263,14 @@ IsSendAppendEntriesResponse == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsAddConfiguration == /\ IsEvent("add_configuration") /\ leadershipState[logline.msg.state.node_id] = Follower /\ UNCHANGED vars /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsSignCommittableMessages == /\ IsEvent("replicate") @@ -259,6 +287,7 @@ IsSignCommittableMessages == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsAdvanceCommitIndex == \* This is enabled *after* a SignCommittableMessages because ACI looks for a @@ -273,11 +302,13 @@ IsAdvanceCommitIndex == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) \/ /\ IsEvent("commit") /\ UNCHANGED vars /\ logline.msg.state.leadership_state = "Follower" /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsChangeConfiguration == /\ IsEvent("add_configuration") @@ -290,6 +321,7 @@ IsChangeConfiguration == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsRcvAppendEntriesResponse == /\ IsEvent("recv_append_entries_response") @@ -311,6 +343,7 @@ IsRcvAppendEntriesResponse == /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsSendRequestVote == /\ IsEvent("send_request_vote") @@ -327,6 +360,7 @@ IsSendRequestVote == /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsRcvRequestVoteRequest == \/ /\ IsEvent("recv_request_vote") @@ -342,6 +376,7 @@ IsRcvRequestVoteRequest == /\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id) /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsExecuteAppendEntries == \* Skip append because ccfraft!HandleRequestVoteRequest atomically handles the request, sends the response, @@ -354,9 +389,11 @@ IsExecuteAppendEntries == /\ currentTerm[logline.msg.state.node_id] = logline.msg.state.current_view /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsRcvRequestVoteResponse == - /\ IsEvent("recv_request_vote_response") + /\ \/ IsEvent("recv_request_vote_response") + \/ IsEvent("recv_request_pre_vote_response") /\ LET i == logline.msg.state.node_id j == logline.msg.from_node_id IN \E m \in Network!MessagesTo(i, j): @@ -373,16 +410,18 @@ IsRcvRequestVoteResponse == /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsBecomeFollower == /\ IsEvent("become_follower") /\ UNCHANGED vars \* UNCHANGED implies that it doesn't matter if we prime the previous variables. + \* We don't assert committable and last idx here, as the spec and implementation are out of sync until + \* IsSendAppendEntriesResponse or IsSendRequestVote (in the candidate path) /\ leadershipState[logline.msg.state.node_id] # Leader - /\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id) /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] - /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsCheckQuorum == /\ IsEvent("become_follower") @@ -393,6 +432,7 @@ IsCheckQuorum == /\ leadershipState'[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) IsRcvProposeVoteRequest == /\ IsEvent("recv_propose_request_vote") @@ -402,15 +442,17 @@ IsRcvProposeVoteRequest == /\ m.type = ProposeVoteRequest /\ m.term = logline.msg.packet.term /\ Discard(m) - /\ UNCHANGED <> + /\ UNCHANGED <> /\ Range(logline.msg.state.committable_indices) \subseteq CommittableIndices(logline.msg.state.node_id) /\ commitIndex[logline.msg.state.node_id] = logline.msg.state.commit_idx /\ leadershipState[logline.msg.state.node_id] = ToLeadershipState[logline.msg.state.leadership_state] /\ membershipState[logline.msg.state.node_id] \in ToMembershipState[logline.msg.state.membership_state] /\ Len(log[logline.msg.state.node_id]) = logline.msg.state.last_idx + /\ (logline.msg.state.pre_vote_enabled => PreVoteEnabled \in preVoteStatus[logline.msg.state.node_id]) TraceNext == \/ IsTimeout + \/ IsBecomeCandidate \/ IsBecomeLeader \/ IsBecomeFollower \/ IsCheckQuorum @@ -528,6 +570,7 @@ TraceDifferentialInv == TraceAlias == DebugAlias @@ [ + l |-> l, _logline |-> TraceLog[l-1] \* Uncomment _ENABLED when debugging the enablement state of ccfraft's actions. diff --git a/tla/consensus/ccfraft.tla b/tla/consensus/ccfraft.tla index 669c6b3fd106..18ed7b0a58bb 100644 --- a/tla/consensus/ccfraft.tla +++ b/tla/consensus/ccfraft.tla @@ -35,8 +35,9 @@ CONSTANT CONSTANTS \* See original Raft paper (https://www.usenix.org/system/files/conference/atc14/atc14-paper-ongaro.pdf) \* and comments for leadership_state in ../src/consensus/aft/raft.h for details on the Follower, - \* Candidate, and Leader states. + \* PreVoteCandidate, Candidate, and Leader states. Follower, + PreVoteCandidate, Candidate, Leader, \* Initial state for a joiner node, until it has received a first message @@ -45,6 +46,7 @@ CONSTANTS LeadershipStates == { Follower, + PreVoteCandidate, Candidate, Leader, None @@ -114,6 +116,23 @@ Nil == ------------------------------------------------------------------------------ \* Global variables +CONSTANTS + \* The node is ignorant of prevote + PreVoteDisabled, + \* The node is able to respond to prevote messages + PreVoteCapable, + \* The node will become a PreVoteCandidate before becoming a Candidate + PreVoteEnabled +VARIABLE + preVoteStatus + +PreVoteStatusTypeInv == preVoteStatus \in + {F \in [Servers -> { + {PreVoteDisabled}, + {PreVoteCapable}, + {PreVoteCapable, PreVoteEnabled}}]: + /\ DOMAIN F = Servers} + \* A set representing requests and responses sent from one server \* to another. With CCF, we have message integrity and can ensure unique messages. \* Messages only records messages that are currently in-flight, actions should @@ -155,6 +174,7 @@ RequestVoteRequestTypeOK(m) == RequestVoteResponseTypeOK(m) == /\ m.type = RequestVoteResponse /\ m.voteGranted \in BOOLEAN + /\ m.isPreVote \in BOOLEAN ProposeVoteRequestTypeOK(m) == /\ m.type = ProposeVoteRequest @@ -340,6 +360,7 @@ LeaderVarsTypeInv == \* All variables; used for stuttering (asserting state hasn't changed). vars == << + preVoteStatus, reconfigurationVars, messageVars, serverVars, @@ -350,6 +371,7 @@ vars == << \* Invariant to check the type safety of all variables TypeInv == + /\ PreVoteStatusTypeInv /\ ReconfigurationVarsTypeInv /\ MessageVarsTypeInv /\ ServerVarsTypeInv @@ -615,20 +637,48 @@ InitCandidateVars == InitLeaderVars == /\ matchIndex = [i \in Servers |-> [j \in Servers |-> 0]] +InitPreVoteStatus == + /\ preVoteStatus = [i \in Servers |-> {PreVoteDisabled}] + Init == /\ InitReconfigurationVars /\ InitMessagesVars /\ InitCandidateVars /\ InitLeaderVars + /\ InitPreVoteStatus ------------------------------------------------------------------------------ \* Define state transitions +BecomePreVoteCandidate(i) == + /\ PreVoteEnabled \in preVoteStatus[i] + \* Only servers that haven't completed retirement can become candidates + /\ membershipState[i] \in (MembershipStates \ {RetiredCommitted}) + \* Only servers that are followers/candidates can become pre-vote-candidates + \* Candidates can time out and become pre-vote-candidates for the next term + /\ leadershipState[i] \in {Follower, Candidate} + /\ + \* Check that the reconfiguration which added this node is at least committable + \/ \E c \in DOMAIN configurations[i] : + /\ i \in configurations[i][c] + /\ MaxCommittableIndex(log[i]) >= c + \* Or if the node isn't in a configuration, that it is retiring + \/ i \in retirementCompleted[i] + /\ leadershipState' = [leadershipState EXCEPT ![i] = PreVoteCandidate] + /\ votesGranted' = [votesGranted EXCEPT ![i] = {i}] + /\ UNCHANGED <> + BecomeCandidate(i) == \* Only servers that haven't completed retirement can become candidates - /\ membershipState[i] \in {Active, RetirementOrdered, RetirementSigned, RetirementCompleted} + /\ membershipState[i] \in (MembershipStates \ {RetiredCommitted}) \* Only servers that are followers/candidates can become candidates - /\ leadershipState[i] \in {Follower, Candidate} + /\ IF PreVoteEnabled \notin preVoteStatus[i] + THEN leadershipState[i] \in {Follower, Candidate} + ELSE /\ leadershipState[i] = PreVoteCandidate + \* To become a Candidate, the PreVoteCandidate must have received votes from a majority in each active configuration + \* Only votes by nodes part of a given configuration should be tallied against it + /\ \A c \in DOMAIN configurations[i] : + (votesGranted[i] \intersect configurations[i][c]) \in Quorums[configurations[i][c]] /\ \* Check that the reconfiguration which added this node is at least committable \/ \E c \in DOMAIN configurations[i] : @@ -641,17 +691,19 @@ BecomeCandidate(i) == \* Candidate votes for itself /\ votedFor' = [votedFor EXCEPT ![i] = i] /\ votesGranted' = [votesGranted EXCEPT ![i] = {i}] - /\ UNCHANGED <> + /\ UNCHANGED <> \* Server i times out (becomes candidate) and votes for itself in the election of the next term \* At some point later (non-deterministically), the candidate will request votes from the other nodes. Timeout(i) == - /\ BecomeCandidate(i) - /\ UNCHANGED messageVars + IF PreVoteEnabled \notin preVoteStatus[i] + THEN BecomeCandidate(i) + ELSE BecomePreVoteCandidate(i) \* Candidate i sends j a RequestVote request. RequestVote(i,j) == LET + isPreVote == leadershipState[i] = PreVoteCandidate msg == [type |-> RequestVoteRequest, term |-> currentTerm[i], \* CCF: Use last signature entry and not last log entry in elections. @@ -659,16 +711,17 @@ RequestVote(i,j) == lastCommittableTerm |-> LastCommittableTerm(i), lastCommittableIndex |-> LastCommittableIndex(i), source |-> i, - dest |-> j] + dest |-> j, + isPreVote |-> isPreVote] IN \* Timeout votes for itself atomically. Thus we do not need to request our own vote. /\ i /= j \* Only requests vote if we are already a candidate (and therefore have not completed retirement) - /\ leadershipState[i] = Candidate + /\ leadershipState[i] \in {PreVoteCandidate, Candidate} \* Reconfiguration: Make sure j is in a configuration of i /\ IsInServerSet(j, i) /\ Send(msg) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Leader i sends j an AppendEntries request AppendEntries(i, j) == @@ -708,7 +761,7 @@ AppendEntries(i, j) == \* Record the most recent index we have sent to this node. \* raft.h::send_append_entries /\ sentIndex' = [sentIndex EXCEPT ![i][j] = @ + Len(m.entries)] - /\ UNCHANGED <> + /\ UNCHANGED <> \* Candidate i transitions to leader. BecomeLeader(i) == @@ -731,7 +784,7 @@ BecomeLeader(i) == \* been rolled back as it was unsigned /\ membershipState' = [membershipState EXCEPT ![i] = IF @ = RetirementOrdered THEN Active ELSE @] - /\ UNCHANGED <> + /\ UNCHANGED <> \* Leader i receives a client request to add to the log. The consensus spec is agnostic to request payloads, \* and does not model or differentiate them. See the consistency spec (tla/consistency/*) for a specification @@ -743,7 +796,7 @@ ClientRequest(i) == /\ membershipState[i] # RetiredCommitted \* Add new request to leader's log /\ log' = [log EXCEPT ![i] = Append(@, [term |-> currentTerm[i], contentType |-> TypeEntry]) ] - /\ UNCHANGED <> + /\ UNCHANGED <> \* CCF: Signed commits \* In CCF, the leader periodically signs the latest log prefix. Only these signatures are committable in CCF. @@ -766,7 +819,7 @@ SignCommittableMessages(i) == /\ IF membershipState[i] = RetirementOrdered THEN membershipState' = [membershipState EXCEPT ![i] = RetirementSigned] ELSE UNCHANGED membershipState - /\ UNCHANGED <> + /\ UNCHANGED <> \* CCF: Reconfiguration of servers \* In the TLA+ model, a reconfiguration is initiated by the Leader which appends an arbitrary new configuration to its own log. @@ -804,7 +857,7 @@ ChangeConfigurationInt(i, newConfiguration) == /\ IF membershipState[i] = Active /\ i \notin newConfiguration THEN membershipState' = [membershipState EXCEPT ![i] = RetirementOrdered] ELSE UNCHANGED membershipState - /\ UNCHANGED <> + /\ UNCHANGED <> ChangeConfiguration(i) == \* Reconfigure to any *non-empty* subset of servers. ChangeConfigurationInt checks that the new @@ -824,7 +877,7 @@ AppendRetiredCommitted(i) == term |-> currentTerm[i], contentType |-> TypeRetired, retired |-> retire_committed_nodes])] - /\ UNCHANGED <> + /\ UNCHANGED <> \* Leader i advances its commitIndex to the next possible Index. @@ -892,15 +945,18 @@ AdvanceCommitIndex(i) == IN Send(msg) ELSE UNCHANGED <> /\ retirementCompleted' = [retirementCompleted EXCEPT ![i] = NextRetirementCompleted(retirementCompleted[i], configurations[i], log[i], commitIndex'[i], i)] - /\ UNCHANGED <> + /\ UNCHANGED <> \* CCF supports checkQuorum which enables a leader to choose to abdicate leadership. CheckQuorum(i) == \* Check node is a leader (and therefore has not completed retirement) /\ leadershipState[i] = Leader + \* There must be an active config which has a replica which is not this node + /\ \E c \in DOMAIN configurations[i]: + \E n \in configurations[i][c]: n /= i /\ leadershipState' = [leadershipState EXCEPT ![i] = Follower] /\ isNewFollower' = [isNewFollower EXCEPT ![i] = TRUE] - /\ UNCHANGED <> + /\ UNCHANGED <> ------------------------------------------------------------------------------ \* Message handlers @@ -915,34 +971,46 @@ HandleRequestVoteRequest(i, j, m) == \* CCF change: Log is only okay up to signatures, \* not any message in the log /\ m.lastCommittableIndex >= MaxCommittableIndex(log[i]) - grant == /\ m.term = currentTerm[i] - /\ logOk - /\ votedFor[i] \in {Nil, j} + grant_pre_vote_disabled == /\ m.term = currentTerm[i] + /\ logOk + /\ votedFor[i] \in {Nil, j} + grant_pre_vote_capable == /\ logOk + /\ IF ~m.isPreVote + THEN /\ m.term = currentTerm[i] + /\ votedFor[i] \in {Nil, j} + ELSE m.term >= currentTerm[i] + grant == IF PreVoteDisabled \in preVoteStatus[i] + THEN grant_pre_vote_disabled + ELSE grant_pre_vote_capable IN /\ m.term <= currentTerm[i] - /\ \/ grant /\ votedFor' = [votedFor EXCEPT ![i] = j] + /\ \/ grant /\ IF PreVoteCapable \in preVoteStatus[i] /\ m.isPreVote + THEN UNCHANGED votedFor + ELSE votedFor' = [votedFor EXCEPT ![i] = j] \/ ~grant /\ UNCHANGED votedFor /\ Reply([type |-> RequestVoteResponse, term |-> currentTerm[i], voteGranted |-> grant, + isPreVote |-> m.isPreVote, source |-> i, dest |-> j], m) - /\ UNCHANGED <> \* Server i receives a RequestVote response from server j with \* m.term = currentTerm[i]. HandleRequestVoteResponse(i, j, m) == /\ m.term = currentTerm[i] - \* Only Candidates need to tally votes - /\ leadershipState[i] = Candidate + \* Only PreVoteCandidates and Candidates need to tally votes + /\ \/ m.isPreVote /\ leadershipState[i] = PreVoteCandidate + \/ ~m.isPreVote /\ leadershipState[i] = Candidate /\ \/ /\ m.voteGranted /\ votesGranted' = [votesGranted EXCEPT ![i] = votesGranted[i] \cup {j}] \/ /\ ~m.voteGranted /\ UNCHANGED votesGranted /\ Discard(m) - /\ UNCHANGED <> \* Server i replies to a AppendEntries request from server j with a NACK @@ -984,17 +1052,17 @@ RejectAppendEntriesRequest(i, j, m, logOk) == source |-> i, dest |-> j], m) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Candidate i steps down to follower in the same term after receiving a message m from a leader in the current term \* Must check that m is an AppendEntries message before returning to follower state ReturnToFollowerState(i, m) == /\ m.term = currentTerm[i] - /\ leadershipState[i] = Candidate + /\ leadershipState[i] \in {PreVoteCandidate, Candidate} /\ leadershipState' = [leadershipState EXCEPT ![i] = Follower] /\ isNewFollower' = [isNewFollower EXCEPT ![i] = TRUE] \* Note that the set of messages is unchanged as m is discarded - /\ UNCHANGED <> \* Follower i receives a AppendEntries from leader j for log entries it already has @@ -1022,7 +1090,7 @@ AppendEntriesAlreadyDone(i, j, index, m) == source |-> i, dest |-> j], m) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Follower i receives an AppendEntries request m where it needs to roll back first \* This action rolls back the log and leaves m in messages for further processing @@ -1038,7 +1106,7 @@ ConflictAppendEntriesRequest(i, index, m) == /\ configurations' = [configurations EXCEPT ![i] = ConfigurationsToIndex(i,Len(new_log))] /\ membershipState' = [membershipState EXCEPT ![i] = CalcMembershipState(log'[i], commitIndex[i], i)] /\ isNewFollower' = [isNewFollower EXCEPT ![i] = FALSE] - /\ UNCHANGED <> + /\ UNCHANGED <> \* Follower i receives an AppendEntries request m from leader j for log entries which directly follow its log NoConflictAppendEntriesRequest(i, j, m) == @@ -1082,7 +1150,7 @@ NoConflictAppendEntriesRequest(i, j, m) == source |-> i, dest |-> j], m) - /\ UNCHANGED <> + /\ UNCHANGED <> AcceptAppendEntriesRequest(i, j, logOk, m) == /\ m.term = currentTerm[i] @@ -1125,7 +1193,7 @@ HandleAppendEntriesResponse(i, j, m) == \* "If AppendEntries fails because of log inconsistency: decrement nextIndex (aka sentIndex +1) and retry" /\ UNCHANGED matchIndex /\ Discard(m) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Any message with a newer term causes the recipient to advance its term first. \* Note that UpdateTerm does not discard message m from the set of messages so this @@ -1136,7 +1204,8 @@ UpdateTerm(i, j, m) == /\ m.term > currentTerm[i] /\ currentTerm' = [currentTerm EXCEPT ![i] = m.term] \* See become_aware_of_new_term() in raft.h:1915 - /\ leadershipState' = [leadershipState EXCEPT ![i] = IF @ \in {Leader, Candidate, None} THEN Follower ELSE @] + /\ leadershipState' = [leadershipState EXCEPT + ![i] = IF @ \in {Leader, Candidate, PreVoteCandidate, None} THEN Follower ELSE @] /\ isNewFollower' = [isNewFollower EXCEPT ![i] = TRUE] /\ votedFor' = [votedFor EXCEPT ![i] = Nil] \* See rollback(last_committable_index()) in raft::become_follower @@ -1148,25 +1217,29 @@ UpdateTerm(i, j, m) == /\ membershipState' = [membershipState EXCEPT ![i] = IF @ = RetirementOrdered THEN Active ELSE @] \* messages is unchanged so m can be processed further. - /\ UNCHANGED <> + /\ UNCHANGED <> \* Responses with stale terms are ignored. DropStaleResponse(i, j, m) == /\ m.term < currentTerm[i] /\ Discard(m) - /\ UNCHANGED <> DropResponseWhenNotInState(i, j, m) == \/ /\ m.type = AppendEntriesResponse /\ leadershipState[i] \in LeadershipStates \ { Leader } \/ /\ m.type = RequestVoteResponse - /\ leadershipState[i] \in LeadershipStates \ { Candidate } + /\ \/ m.isPreVote /\ leadershipState[i] /= PreVoteCandidate + \/ ~m.isPreVote /\ leadershipState[i] /= Candidate /\ Discard(m) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Drop messages if they are irrelevant to the node DropIgnoredMessage(i,j,m) == + \* raft.h::recv_request_vote + \* We specifically always respond to request votes + /\ m.type /= RequestVoteRequest \* Drop messages if... /\ \* .. recipient is still None.. @@ -1181,9 +1254,15 @@ DropIgnoredMessage(i,j,m) == \* This spec requires that a retired node still helps with voting and appending entries to ensure \* the next configurations learns that its retirement has been committed. \/ /\ membershipState[i] = RetiredCommitted - /\ m.type \notin {RequestVoteRequest, AppendEntriesRequest} + /\ m.type /= AppendEntriesRequest + \* raft.h::recv_append_entries + \* We drop append entries which start before the commit index + \* This is safe as sentIndex will still be incremented allowing subsequent AppendEntries + \* to start later and hence it won't livelock + \/ /\ m.type = AppendEntriesRequest + /\ m.prevLogIndex < commitIndex[i] /\ Discard(m) - /\ UNCHANGED <> + /\ UNCHANGED <> \* Receive a message. @@ -1259,6 +1338,7 @@ Receive(i, j) == \* Defines how the variables may transition, given a node i. NextInt(i) == \/ Timeout(i) + \/ (PreVoteEnabled \in preVoteStatus[i] /\ BecomeCandidate(i)) \/ BecomeLeader(i) \/ ClientRequest(i) \/ SignCommittableMessages(i) @@ -1288,6 +1368,7 @@ Fairness == /\ \A s \in Servers : WF_vars(SignCommittableMessages(s)) /\ \A s \in Servers : WF_vars(AdvanceCommitIndex(s)) /\ \A s \in Servers : WF_vars(AppendRetiredCommitted(s)) + /\ \A s \in Servers : WF_vars(PreVoteEnabled \in preVoteStatus[s] /\ BecomeCandidate(s)) /\ \A s \in Servers : WF_vars(BecomeLeader(s)) /\ \A s \in Servers : WF_vars(Timeout(s)) /\ \A s \in Servers : WF_vars(ChangeConfiguration(s)) @@ -1589,8 +1670,17 @@ PermittedLogChangesProp == StateTransitionsProp == [][\A i \in Servers : /\ leadershipState[i] = None => leadershipState'[i] \in {None, Follower} - /\ leadershipState[i] = Follower => leadershipState'[i] \in {Follower, Candidate} - /\ leadershipState[i] = Candidate => leadershipState'[i] \in {Follower, Candidate, Leader} + /\ IF PreVoteEnabled \notin preVoteStatus[i] + THEN + \* A follower can become a Candidate via a timeout or a ProposeVoteRequest + /\ leadershipState[i] = Follower => leadershipState'[i] \in {Follower, Candidate} + /\ leadershipState[i] /= PreVoteCandidate + /\ leadershipState[i] = Candidate => leadershipState'[i] \in {Follower, Candidate, Leader} + ELSE + \* A follower can become a PreVoteCandidate via a timeout, or a Candidate via ProposeVoteRequest + /\ leadershipState[i] = Follower => leadershipState'[i] \in {Follower, PreVoteCandidate, Candidate} + /\ leadershipState[i] = PreVoteCandidate => leadershipState'[i] \in {Follower, PreVoteCandidate, Candidate} + /\ leadershipState[i] = Candidate => leadershipState'[i] \in {Follower, PreVoteCandidate, Candidate, Leader} /\ leadershipState[i] = Leader => leadershipState'[i] \in {Follower, Leader} ]_vars @@ -1621,6 +1711,13 @@ LeaderProp == \* There is repeatedly a non-retired leader. []<><<\E i \in Servers : leadershipState[i] = Leader /\ membershipState[i] # RetiredCommitted>>_vars +VotesGrantedMonotonicProp == + \A i \in Servers: [][ + /\ leadershipState[i] = leadershipState'[i] + /\ currentTerm[i] = currentTerm'[i] + => votesGranted[i] \subseteq votesGranted'[i] + ]_vars + ------------------------------------------------------------------------------ \* Refinement of the more high-level specification abs.tla that abstracts the \* asynchronous network and the message passing between nodes. Instead, any @@ -1687,4 +1784,9 @@ DebugCommittedEntriesTermsInv == k <= commitIndex[i] => log[i][k].term >= log[j][k].term +DebugNonTrivialLeaderInv == + ~\E i \in Servers: + /\ leadershipState[i] = Leader + /\ currentTerm[i] > StartTerm + =============================================================================== \ No newline at end of file diff --git a/tla/install_deps.py b/tla/install_deps.py index 126bcdd1f188..493420e89636 100644 --- a/tla/install_deps.py +++ b/tla/install_deps.py @@ -71,7 +71,8 @@ def _parse_args() -> argparse.Namespace: ) parser.add_argument( - "--skip-apt-packages", action="store_false", default=True, dest="apt_packages" + "--tdnf-extended", + action="store_true", ) return parser.parse_args() @@ -89,6 +90,21 @@ def install_tlc(): def install_deps(args: argparse.Namespace): + if args.tdnf_extended: + with open("/etc/yum.repos.d/tdnf.repo", "w", encoding="utf-8") as tdnf_repo: + tdnf_repo.write( + """[azurelinux-official-extended] + name=Azure Linux Official Extended $releasever $basearch + baseurl=https://packages.microsoft.com/azurelinux/$releasever/prod/extended/$basearch + gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY + gpgcheck=1 + repo_gpgcheck=1 + enabled=1 + skip_if_unavailable=True + sslverify=1""" + ) + subprocess.check_call(["tdnf", "install", "-y", "parallel"]) + # Setup tools directory tools_dir = os.path.join(TLA_DIR, "tools") @@ -112,17 +128,6 @@ def create_tools_dir(): dest=tools_dir, ) - if args.apt_packages: - subprocess.Popen( - "sudo apt-get install -y --no-install-recommends".split() - + [ - "wget", - "graphviz", - "htop", - "texlive-latex-recommended", - ] - ).wait() - fetch_latest( url="https://nightly.tlapl.us/dist/tla2tools.jar", dest=TLA_DIR, diff --git a/tla/tlc.py b/tla/tlc.py index 63a9f0f01de9..ec7ed4b9f7cd 100755 --- a/tla/tlc.py +++ b/tla/tlc.py @@ -9,6 +9,9 @@ # - Capture useful switches for CI, debugging # - Expose specification configuration through CLI # - Provide a useful --help, and basic sanity checks +# +# Requires that tla2tools.jar and CommunityModules-deps.jar are in the same directory as this script +# See install_deps.py import os import sys @@ -128,12 +131,30 @@ def cli(): action="store_true", help="Set TLC to use depth-first search", ) - tv.add_argument( + tv_group = tv.add_mutually_exclusive_group() + tv_group.add_argument( "--ccf-raft-trace", type=pathlib.Path, default=None, help="Path to a CCF Raft trace .ndjson file, for example produced by make_traces.sh", ) + tv_group.add_argument( + "--scenario", + default=None, + help="Path to a specific scenario file to run. If provided will generate the trace from this scenario and validate it.", + ) + + # scenario trace generation + tv.add_argument( + "--raft-driver", + default="../build/raft_driver", + help="Path to the raft_driver binary", + ) + tv.add_argument( + "--scenarios-runner", + default="../tests/raft_scenarios_runner.py", + help="Path to the raft_scenarios_runner.py script", + ) # Simulation sim = subparsers.add_parser("sim", help="Simulation") @@ -223,6 +244,27 @@ def cli(): jvm_args.append("-Dtlc2.tool.queue.IStateQueue=StateDeque") if args.ccf_raft_trace is not None: env["CCF_RAFT_TRACE"] = args.ccf_raft_trace + if args.scenario is not None: + # Generate the trace from the scenario using the scenarios runner + trace_dir = "traces" + cmd = [ + sys.executable, + args.scenarios_runner, + args.raft_driver, + "--output", + trace_dir, + args.scenario, + ] + print(f"Generating trace from scenario with command: {shlex.join(cmd)}") + ret = os.system(shlex.join(cmd)) + if ret != 0: + print(f"Error generating trace from scenario, exited with code {ret}") + sys.exit(ret) + print(f"Generated trace in directory: {trace_dir}") + trace_path = os.path.join( + trace_dir, os.path.basename(args.scenario) + ".ndjson" + ) + env["CCF_RAFT_TRACE"] = trace_path elif args.cmd == "sim": tlc_args.extend(["-simulate"]) if args.num is not None: