From 5a7a27bc1a8e6889ce7cc9ea8acdf0e3d0f9ce2f Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Sun, 29 Mar 2020 16:59:11 +0530 Subject: [PATCH 01/23] Create Consensus-Protocol.md Adds basic skeleton of the design doc --- .../parser/scanner/Consensus-Protocol.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 doc/internal/parser/scanner/Consensus-Protocol.md diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md new file mode 100644 index 00000000..071c12c1 --- /dev/null +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -0,0 +1,16 @@ +# The consensus + +Maintaining consensus is one of the major parts of a distributed system. To know to have achieved a stable system, we need the following two parts of implementation. + +## The Raft protocol + + +## A strict testing mechanism + +The testing mechanism to be implemented will enable us in figuring out the problems existing in the implementation leading to a more resilient system. +We have to test for the following basic failures: +1. Network partitioning. +2. Un-responsive peers. +3. Overloading peer. +4. Corruption of data in transit. + From df185c308e18648ea7a57e392ccd6dfcc547835e Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Sun, 29 Mar 2020 20:09:03 +0530 Subject: [PATCH 02/23] Update Consensus-Protocol.md --- doc/internal/parser/scanner/Consensus-Protocol.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 071c12c1..37cb6a64 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -14,3 +14,6 @@ We have to test for the following basic failures: 3. Overloading peer. 4. Corruption of data in transit. +## Graceful handling of failures + +Accepting failures exist and handling them gracefully enables creation of more resilient and stabler systems. Having _circuit breakers_, _backoff mechanisms in clients_ and _validation and coordination mechanisms_ are some of the pointers to be followed. From 099498af2a4585b56f5c9d90e99c346a05fab576 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Mon, 30 Mar 2020 15:05:23 +0530 Subject: [PATCH 03/23] Update doc/internal/parser/scanner/Consensus-Protocol.md Co-Authored-By: Tim Satke <48135919+TimSatke@users.noreply.github.com> --- doc/internal/parser/scanner/Consensus-Protocol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 37cb6a64..7bc19a6e 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -16,4 +16,4 @@ We have to test for the following basic failures: ## Graceful handling of failures -Accepting failures exist and handling them gracefully enables creation of more resilient and stabler systems. Having _circuit breakers_, _backoff mechanisms in clients_ and _validation and coordination mechanisms_ are some of the pointers to be followed. +Accepting failures exist and handling them gracefully enables creation of more resilient and stable systems. Having _circuit breakers_, _backoff mechanisms in clients_ and _validation and coordination mechanisms_ are some of the pointers to be followed. 
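The _backoff mechanisms in clients_ mentioned under graceful failure handling can be made concrete with a minimal Go sketch. It is only illustrative: the helper name `retryWithBackoff`, the attempt limit and the base delay are assumptions for the example, not anything the database currently defines; the failing `op` stands in for a peer RPC.

```go
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// retryWithBackoff retries op with exponential backoff and random jitter,
// giving up after maxAttempts and returning the last error.
func retryWithBackoff(maxAttempts int, base time.Duration, op func() error) error {
	var err error
	delay := base
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		if err = op(); err == nil {
			return nil
		}
		// Wait for the current delay plus jitter, then double the delay.
		jitter := time.Duration(rand.Int63n(int64(delay)))
		time.Sleep(delay + jitter)
		delay *= 2
	}
	return fmt.Errorf("giving up after %d attempts: %w", maxAttempts, err)
}

func main() {
	calls := 0
	err := retryWithBackoff(5, 50*time.Millisecond, func() error {
		calls++
		if calls < 3 {
			return errors.New("peer unreachable") // simulated transient failure
		}
		return nil
	})
	fmt.Println("calls:", calls, "err:", err)
}
```

The jitter term keeps many retrying clients from hitting a recovering peer in lock-step, which is the point of pairing backoff with the circuit-breaker idea above.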
From 43e600e70e0ba43782de30d73f08cd39b02b0bd1 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Mon, 30 Mar 2020 21:47:49 +0530 Subject: [PATCH 04/23] Update Consensus-Protocol.md --- doc/internal/parser/scanner/Consensus-Protocol.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 7bc19a6e..35b8f529 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -1,9 +1,18 @@ # The consensus +Before talking about consensus, we need to discuss some logistics based on how the systems can co-exist. + +* Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The standard Go package [RPC](https://golang.org/pkg/net/rpc/) serves us the purpose. +* Security: Access control mechanisms need to be in place to decide on access to functions in the servers based on their state (leader, follower, candidate) + Maintaining consensus is one of the major parts of a distributed system. To know to have achieved a stable system, we need the following two parts of implementation. ## The Raft protocol +A raft server may be in any of the 3 states; leader, follower or candidate. All requests are serviced through the leader and it then decides how and if the logs must be replicated in the follower machines. The raft protocol has 3 almost independent modules: +1. Leader Election +2. Log Replication +3. Safety ## A strict testing mechanism From bd90b1fce19f4b8979f25440a83e753cbef59e9b Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Wed, 1 Apr 2020 00:22:46 +0530 Subject: [PATCH 05/23] Update Consensus-Protocol.md --- .../parser/scanner/Consensus-Protocol.md | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 35b8f529..1cb8131c 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -14,6 +14,27 @@ A raft server may be in any of the 3 states; leader, follower or candidate. All 2. Log Replication 3. Safety +A detailed description of all the modules follow: + +### Leader Election +* Startup: All servers start in the follower state and begin by requesting votes to be elected as a leader. +* Pre-election: The server increments its `currentTerm` by one, changes to `candidate` state and sends out `RequestVotes` RPC parallely to all the peers. +* Vote condition: FCFS basis. If there was no request to the server, it votes for itself (read 3.6 and clear out when to vote for itself) +* Election timeout: A preset time for which the server waits to see if a peer requested a vote. It is randomly chosen between 150-300ms. +* Election is repeated after an election timeout until: + 1. The server wins the election + 2. A peer establishes itself as leader. + 3. Election timer times out or a split vote occurs (leading to no leader) and the process will be repeated. + * Election win: Majority votes in the term. (More details in safety) The state of the winner is now `Leader` and the others are `Followers`. + * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. 
If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. + * Transition from working state to Election happens when a leader fails. + * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the "waiter"'s term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. + * The split vote problem: Though not that common, split votes can occur. To make sure this doesnt continue indefinitely, election timeouts are randomised, + +### Log Replication + +### Safety + ## A strict testing mechanism The testing mechanism to be implemented will enable us in figuring out the problems existing in the implementation leading to a more resilient system. From 7d26490d7c74bb1738129ca0d2154535317f8646 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 2 Apr 2020 00:20:25 +0530 Subject: [PATCH 06/23] Update Consensus-Protocol.md --- .../parser/scanner/Consensus-Protocol.md | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 1cb8131c..01cf17b0 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -17,6 +17,8 @@ A raft server may be in any of the 3 states; leader, follower or candidate. All A detailed description of all the modules follow: ### Leader Election + +#### Spec * Startup: All servers start in the follower state and begin by requesting votes to be elected as a leader. * Pre-election: The server increments its `currentTerm` by one, changes to `candidate` state and sends out `RequestVotes` RPC parallely to all the peers. * Vote condition: FCFS basis. If there was no request to the server, it votes for itself (read 3.6 and clear out when to vote for itself) @@ -29,10 +31,22 @@ A detailed description of all the modules follow: * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. * Transition from working state to Election happens when a leader fails. * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the "waiter"'s term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. - * The split vote problem: Though not that common, split votes can occur. To make sure this doesnt continue indefinitely, election timeouts are randomised, + * The split vote problem: Though not that common, split votes can occur. To make sure this doesnt continue indefinitely, election timeouts are randomised, making the split votes less probable. + +#### Implementation + ### Log Replication +#### Spec + +* Pre-log replication: Once a leader is elected, it starts servicing the client. The leader appends a new request to its `New Entry` log then issues `AppendEntriesRPC` in parallel to all its peers. 
+* Successful log: When all logs have been applied successfully to all follower machines, the leader applies the entry to its state machine and returns the result to the client. +* Repeating `AppendEntries`: `AppendEntriesRPC` are repeated indefinitely until all followers eventually store all log entries. +* Log entry storage: + +#### Implementation + ### Safety ## A strict testing mechanism From 5923f4fa8ae809e235d067a73f85c4b92080c788 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 2 Apr 2020 15:14:31 +0530 Subject: [PATCH 07/23] Update Consensus-Protocol.md --- doc/internal/parser/scanner/Consensus-Protocol.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 01cf17b0..24e88933 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -4,6 +4,7 @@ Before talking about consensus, we need to discuss some logistics based on how t * Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The standard Go package [RPC](https://golang.org/pkg/net/rpc/) serves us the purpose. * Security: Access control mechanisms need to be in place to decide on access to functions in the servers based on their state (leader, follower, candidate) +* Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and simply forward this request to the current leader; OR have a proxy that can forward the request to the current leader wheneve the requests come in. Maintaining consensus is one of the major parts of a distributed system. To know to have achieved a stable system, we need the following two parts of implementation. @@ -61,3 +62,8 @@ We have to test for the following basic failures: ## Graceful handling of failures Accepting failures exist and handling them gracefully enables creation of more resilient and stable systems. Having _circuit breakers_, _backoff mechanisms in clients_ and _validation and coordination mechanisms_ are some of the pointers to be followed. + +## Running Lbadd on Raft + +* Background: Raft is just a consensus protocol that helps keep different database servers in sync. We need methods to issue a command and enable the sync between servers. +* Logistics: The `AppendEntriesRPC` will have the command to be executed by the client. This command goes through the leader, is applied by all the followers and then committed by the leader. Thus ensuring an in-sync distributed database. From 02239c8e72b9dbffee0bd6ab3e22e232b02b26a2 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Fri, 3 Apr 2020 20:48:03 +0530 Subject: [PATCH 08/23] Update Consensus-Protocol.md --- doc/internal/parser/scanner/Consensus-Protocol.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 24e88933..8340175d 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -4,7 +4,8 @@ Before talking about consensus, we need to discuss some logistics based on how t * Communication: Distributed systems need a method to communicate between each other. 
Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The standard Go package [RPC](https://golang.org/pkg/net/rpc/) serves us the purpose. * Security: Access control mechanisms need to be in place to decide on access to functions in the servers based on their state (leader, follower, candidate) -* Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and simply forward this request to the current leader; OR have a proxy that can forward the request to the current leader wheneve the requests come in. +* Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and simply forward this request to the current leader; OR have a proxy that can forward the request to the current leader wheneve the requests come in. (Section client interaction of post has another approach which works too) +* The servers will be implemented in the `interal/master` or `internal/worker` folders which will import the raft API and perform their functions. Maintaining consensus is one of the major parts of a distributed system. To know to have achieved a stable system, we need the following two parts of implementation. @@ -37,6 +38,8 @@ A detailed description of all the modules follow: #### Implementation +* A separate `interal/raft` folder will have a raft implementation which provides APIs for each server to call. +* ### Log Replication #### Spec From 2b900c00819598ca4c19532c32c425758644d019 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Sat, 4 Apr 2020 23:30:01 +0530 Subject: [PATCH 09/23] Update Consensus-Protocol.md --- doc/internal/parser/scanner/Consensus-Protocol.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/parser/scanner/Consensus-Protocol.md index 8340175d..067b4cf4 100644 --- a/doc/internal/parser/scanner/Consensus-Protocol.md +++ b/doc/internal/parser/scanner/Consensus-Protocol.md @@ -47,7 +47,9 @@ A detailed description of all the modules follow: * Pre-log replication: Once a leader is elected, it starts servicing the client. The leader appends a new request to its `New Entry` log then issues `AppendEntriesRPC` in parallel to all its peers. * Successful log: When all logs have been applied successfully to all follower machines, the leader applies the entry to its state machine and returns the result to the client. * Repeating `AppendEntries`: `AppendEntriesRPC` are repeated indefinitely until all followers eventually store all log entries. -* Log entry storage: +* Log entry storage: Log entries are a queue of state machine commands which are applied to that particular state machine. Log entries are associated with a term number to indicate the term of application of that log along with an integer index to identify a particular logs position. +* Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. 
Once a follower learns that +* Add more from section 3.6 #### Implementation From b18332a7da6c4c0870963c7b505b2baea8413899 Mon Sep 17 00:00:00 2001 From: SUMUKHA-PK Date: Fri, 10 Apr 2020 10:51:34 +0530 Subject: [PATCH 10/23] Moved doc to appropriate folder --- doc/internal/{parser/scanner => consensus}/Consensus-Protocol.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename doc/internal/{parser/scanner => consensus}/Consensus-Protocol.md (100%) diff --git a/doc/internal/parser/scanner/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md similarity index 100% rename from doc/internal/parser/scanner/Consensus-Protocol.md rename to doc/internal/consensus/Consensus-Protocol.md From b11486efa0a231809dc760c6b82e6a26385814f0 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Fri, 1 May 2020 18:28:21 +0530 Subject: [PATCH 11/23] Update Consensus-Protocol.md --- doc/internal/consensus/Consensus-Protocol.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index 067b4cf4..18a4c633 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -48,7 +48,13 @@ A detailed description of all the modules follow: * Successful log: When all logs have been applied successfully to all follower machines, the leader applies the entry to its state machine and returns the result to the client. * Repeating `AppendEntries`: `AppendEntriesRPC` are repeated indefinitely until all followers eventually store all log entries. * Log entry storage: Log entries are a queue of state machine commands which are applied to that particular state machine. Log entries are associated with a term number to indicate the term of application of that log along with an integer index to identify a particular logs position. -* Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. Once a follower learns that +* Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. +* The leader keeps track of the highest known index that it knows is committed and it is included in all the future `AppendEntriesRPC` (including heartbeats) to inform other servers. +* Theres some issue about log committing - paragraph says its committed when its applied everywhere and also says its applied everywhere once its committed. +* Log matching property: + * If two entries in different logs have the same index and term, then they store the same com-mand. + * If two entries in different logs have the same index and term, then the logs are identical in allpreceding entries. 
+ * * Add more from section 3.6 #### Implementation From 7eef110cfb1be7ad5e819b00fe4b505e7fef0339 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Wed, 6 May 2020 21:38:55 +0530 Subject: [PATCH 12/23] Update Consensus-Protocol.md --- doc/internal/consensus/Consensus-Protocol.md | 63 +++++++++++++++++--- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index 18a4c633..70e0e405 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -2,15 +2,24 @@ Before talking about consensus, we need to discuss some logistics based on how the systems can co-exist. -* Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The standard Go package [RPC](https://golang.org/pkg/net/rpc/) serves us the purpose. +* Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The existing `network` layer of the database will handle all the communication between servers. * Security: Access control mechanisms need to be in place to decide on access to functions in the servers based on their state (leader, follower, candidate) * Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and simply forward this request to the current leader; OR have a proxy that can forward the request to the current leader wheneve the requests come in. (Section client interaction of post has another approach which works too) -* The servers will be implemented in the `interal/master` or `internal/worker` folders which will import the raft API and perform their functions. +* The servers will be implemented in the `interal/node` folders which will import the raft API and perform their functions. Maintaining consensus is one of the major parts of a distributed system. To know to have achieved a stable system, we need the following two parts of implementation. ## The Raft protocol +The raft protocol will be implemented in `internal/raft` and will implement APIs that each node can call. + +Raft is an algorithm to handle replicated log, and we maintain the "log" of the SQL stmts applied on a DB and have a completely replicated cluster. + +#### General Implementation rules: +* All RPC calls are done parallely to obtain the best performance. +* Request retries are done in case of network failures. +* Raft does not assume network preserves ordering of the packets. + A raft server may be in any of the 3 states; leader, follower or candidate. All requests are serviced through the leader and it then decides how and if the logs must be replicated in the follower machines. The raft protocol has 3 almost independent modules: 1. Leader Election 2. Log Replication @@ -30,6 +39,7 @@ A detailed description of all the modules follow: 2. A peer establishes itself as leader. 3. Election timer times out or a split vote occurs (leading to no leader) and the process will be repeated. * Election win: Majority votes in the term. (More details in safety) The state of the winner is now `Leader` and the others are `Followers`. 
+ * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updatesits current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. * Transition from working state to Election happens when a leader fails. * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the "waiter"'s term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. @@ -38,8 +48,9 @@ A detailed description of all the modules follow: #### Implementation -* A separate `interal/raft` folder will have a raft implementation which provides APIs for each server to call. -* +* The raft module will provide a `StartElection` function that enables a node to begin election. This function just begins the election and doesnt return any result of the election. The decision of the election will be handled by the votes and each server independently. +* The Leader node is the only one to know its the leader in the beginning. It realises it has obtained the majority votes, and starts behaving like the leader node. During this period, other nodes wait for a possible leader and begin to proceed in the candidate state by advancing to the next term unless the leader contacts them. + ### Log Replication #### Spec @@ -48,19 +59,46 @@ A detailed description of all the modules follow: * Successful log: When all logs have been applied successfully to all follower machines, the leader applies the entry to its state machine and returns the result to the client. * Repeating `AppendEntries`: `AppendEntriesRPC` are repeated indefinitely until all followers eventually store all log entries. * Log entry storage: Log entries are a queue of state machine commands which are applied to that particular state machine. Log entries are associated with a term number to indicate the term of application of that log along with an integer index to identify a particular logs position. -* Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. +* A committed entry: When a leader decides that the log entry is safe to apply to other state machines, that entry is called committed. All committed entries are durable and _will eventually be executed_ by all state machines. +* An entry -> Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. * The leader keeps track of the highest known index that it knows is committed and it is included in all the future `AppendEntriesRPC` (including heartbeats) to inform other servers. 
-* Theres some issue about log committing - paragraph says its committed when its applied everywhere and also says its applied everywhere once its committed. +* Theres some issue about log committing - "A log entry is committed once the leader that createdthe entry has replicated it on a majority of the servers" and " Once a follower learns that a log entry is committed, it applies theentry to its local state machine (in log order)." are not clear whether replicating and applying to state machine are the same. If they are its kind of a contradiction, else "aplication" can mean executing the STMT in the DB in our case. * Log matching property: * If two entries in different logs have the same index and term, then they store the same com-mand. * If two entries in different logs have the same index and term, then the logs are identical in allpreceding entries. - * -* Add more from section 3.6 +* When sending an AppendEntriesRPC, the leader includes the index and term of the entry in its log that immediately precedes thenew entries. If the follower does not find an entry in its log with the same index and term, then itrefuses the new entries. This helps in log matching. Which implies, a successful `AppendEntries` RPC means a synced log. +* Leader crashes inducing inconsistencies in logs: In Raft, the leader handles inconsistencies by forcing the followers’ logs to duplicate its own (the leader's). To do so, the leader must find the latest log entrywhere the two logs agree, delete any entries in the follower’s log after that point, and send thefollower all of the leader’s entries after that point. All of these actions happen in response to theconsistency check performed by AppendEntries RPCs. Meaning, the leader checks for the consistency by maintaining a `nextIndex` value and dropping it down and sending `AppendEntries` RPC (which does the consistency check and fails unless they're same) until a success is returned. These "check `AppendEntries` can be empty to save BandWidth(BW). The follower can also help here by sending the smallest agreeing index in the first RPC instead of the leader probing until it reaches the index. +* Leader append-only property: A leader never overwrites or deletes entries in its own log. #### Implementation ### Safety +This module ensures that the above protocol runs as expected, eliminating the corner cases. + +#### Spec + +* Election restriction: Raft uses the voting process to prevent a candidate from winning an election unless its log contains all committed entries. A candidate must contact a majority of the cluster in order to be elected, which means that every committed entry must be present in at least one of those servers. If the candidate’s log is at least as up-to-date as any other log in that majority, then it will hold all the committed entries. The `RequestVote` RPC implements thisrestriction: the RPC includes information about the candidate’s log, and the voter denies its vote if its own log is more up-to-date than that of the candidate. +* The decision of _which is the more updated_ log is based on the index and the term of the last log. Term gets priority, if terms are same, index is checked for precedence. +* Committing from previous term: Raft never commits log entries from previous terms by counting replicas. Only log entries from the leader’s current term are committed bycounting replicas; once an entry from the current term has been committed in this way, then all prior entries are committed indirectly because of the Log Matching Property. 
+* Follower or Candidate crashes: `RequestVotes` or `AppendEntries` RPC are tried indefinetely. Raft RPC are _idempotent_ making the retries harmless. (Possible judgement call: Should I wait for the results of the votes to base it for my decision in the election?) +* `Current term`, `last applied index` and `vote` that was casted along with the logs must be persisted in case of server crashes. + +#### Implementation + +#### Client interaction: +* Idempotency is maintained by having a unique client ID and the request ID. The same request by the same client cannot be requested twice, we assume here that the client didn't receive the responses due to network errors etc. Each server maintains a _session_ for each client. The session tracks the latest serial number processed for the client, along with the associated response. If a server receives a command whose serial number has already been executed, it responds immediately without re-executing the request. +* With each request, the client includes the lowest sequencenumber for which it has not yet received a response, and the state machine then discards all responsesfor lower sequence numbers. Quite similar to TCP. +* The session for a client are _open on all nodes_. Session expiry happens on all nodes at once and in a deterministic way. LRU or an upper bound for sessions can be used. Some sort of timely probing is done to remove stale sessions. +* Live clients issue keep-alive requests during periods of inactivity, which are also augmented withthe leader’s timestamp and committed to the Raft log, in order to maintain their sessions. +* Reads can bypass the Raft log only if: + * If the leader has not yet marked an entry from its current term committed, it waits until ithas done so. The Leader Completeness Property guarantees that a leader has all committed entries, but at the start of its term, it may not know which those are. To find out, it needs to commit an entry from its term. Raft handles this by having each leader commit a blank `no-op` entry into the log at the start of its term. As soon as this no-op entry is committed, the leader’scommit index will be at least as large as any other servers’ during its term. + * The leader saves its current commit index in a local variable `readIndex`. This will be used as a lower bound for the version of the state that the query operates against. + * The leader needs to make sure it hasn’t been superseded by a newer leader of which it is unaware. It issues a new round of heartbeats and waits for their acknowledgments from amajority of the cluster. Once these acknowledgments are received, the leader knows that there could not have existed a leader for a greater term at the moment it sent the heartbeats. Thus, the readIndex was, at the time, the largest commit index ever seen by any server in the cluster. + * The leader waits for its state machine to advance at least as far as the readIndex; this is current enough to satisfy linearizability + * Finally, the leader issues the query against its state machine and replies to the client with the results. + + ## A strict testing mechanism The testing mechanism to be implemented will enable us in figuring out the problems existing in the implementation leading to a more resilient system. @@ -78,3 +116,12 @@ Accepting failures exist and handling them gracefully enables creation of more r * Background: Raft is just a consensus protocol that helps keep different database servers in sync. We need methods to issue a command and enable the sync between servers. 
* Logistics: The `AppendEntriesRPC` will have the command to be executed by the client. This command goes through the leader, is applied by all the followers and then committed by the leader. Thus ensuring an in-sync distributed database. +* Each server will independently run the SQL statement once the statement is committed. This ensures that the state of the database is in sync. The data that moves around in the raft cluster can be a compiled SQL statement that each node will run independently. + + +## Appendix + +* The difference between _commit_, _replicate_ and _apply_ with respect raft: What I have understood till now is, applying means letting the log run through the node's state machine. This is the end process, happens after a commit. A commit is once replication happens on a majority of the nodes. While replication is simply appending of a log on _one_ node. +* Some gotchas I thought about: + * Client connects to the leader and leader crashes -> reject the connection. Let the client connect when the new leader is established. + * Some sort of idempotency must be maintained w.r.t. the client-cluster communication. Multiple requests submitted by the client should not cause problems due to network errors. From 1209205b00cdef853b86e0d9d40be9e6f6b47e38 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 7 May 2020 23:15:18 +0530 Subject: [PATCH 13/23] Update doc/internal/consensus/Consensus-Protocol.md Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com> --- doc/internal/consensus/Consensus-Protocol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index 70e0e405..8f897882 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -67,7 +67,7 @@ A detailed description of all the modules follow: * If two entries in different logs have the same index and term, then they store the same com-mand. * If two entries in different logs have the same index and term, then the logs are identical in allpreceding entries. * When sending an AppendEntriesRPC, the leader includes the index and term of the entry in its log that immediately precedes thenew entries. If the follower does not find an entry in its log with the same index and term, then itrefuses the new entries. This helps in log matching. Which implies, a successful `AppendEntries` RPC means a synced log. -* Leader crashes inducing inconsistencies in logs: In Raft, the leader handles inconsistencies by forcing the followers’ logs to duplicate its own (the leader's). To do so, the leader must find the latest log entrywhere the two logs agree, delete any entries in the follower’s log after that point, and send thefollower all of the leader’s entries after that point. All of these actions happen in response to theconsistency check performed by AppendEntries RPCs. Meaning, the leader checks for the consistency by maintaining a `nextIndex` value and dropping it down and sending `AppendEntries` RPC (which does the consistency check and fails unless they're same) until a success is returned. These "check `AppendEntries` can be empty to save BandWidth(BW). The follower can also help here by sending the smallest agreeing index in the first RPC instead of the leader probing until it reaches the index. +* Leader crashes inducing inconsistencies in logs: In Raft, the leader handles inconsistencies by forcing the followers’ logs to duplicate its own (the leader's). 
To do so, the leader must find the latest log entry where the two logs agree, delete any entries in the follower’s log after that point, and send the follower all of the leader’s entries after that point. All of these actions happen in response to the consistency check performed by AppendEntries RPCs. Meaning, the leader checks for the consistency by maintaining a `nextIndex` value and dropping it down and sending `AppendEntriesRPC` (which does the consistency check and fails unless they're same) until a success is returned. These "check `AppendEntries` can be empty to save BandWidth(BW). The follower can also help here by sending the smallest agreeing index in the first RPC instead of the leader probing until it reaches the index. * Leader append-only property: A leader never overwrites or deletes entries in its own log. #### Implementation From 8d25041dbe3b81eb3f0c02964980e8fa50bfa79f Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 7 May 2020 23:17:07 +0530 Subject: [PATCH 14/23] Apply suggestions from code review Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com> --- doc/internal/consensus/Consensus-Protocol.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index 8f897882..a17ae3d5 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -42,8 +42,8 @@ A detailed description of all the modules follow: * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updatesits current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. * Transition from working state to Election happens when a leader fails. - * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the "waiter"'s term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. - * The split vote problem: Though not that common, split votes can occur. To make sure this doesnt continue indefinitely, election timeouts are randomised, making the split votes less probable. + * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the waiting node's term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. + * The split vote problem: Though not that common, split votes can occur. To make sure this doesn't continue indefinitely, election timeouts are randomised, making the split votes less probable. 
#### Implementation @@ -62,11 +62,11 @@ A detailed description of all the modules follow: * A committed entry: When a leader decides that the log entry is safe to apply to other state machines, that entry is called committed. All committed entries are durable and _will eventually be executed_ by all state machines. * An entry -> Committed entry: A log entry is called committed once its replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leaders log, including the entries created by the previous leaders. * The leader keeps track of the highest known index that it knows is committed and it is included in all the future `AppendEntriesRPC` (including heartbeats) to inform other servers. -* Theres some issue about log committing - "A log entry is committed once the leader that createdthe entry has replicated it on a majority of the servers" and " Once a follower learns that a log entry is committed, it applies theentry to its local state machine (in log order)." are not clear whether replicating and applying to state machine are the same. If they are its kind of a contradiction, else "aplication" can mean executing the STMT in the DB in our case. +* Theres some issue about log committing - "A log entry is committed once the leader that createdthe entry has replicated it on a majority of the servers" and " Once a follower learns that a log entry is committed, it applies theentry to its local state machine (in log order)." are not clear whether replicating and applying to state machine are the same. If they are its kind of a contradiction, else "application" can mean executing the STMT in the DB in our case. * Log matching property: - * If two entries in different logs have the same index and term, then they store the same com-mand. - * If two entries in different logs have the same index and term, then the logs are identical in allpreceding entries. -* When sending an AppendEntriesRPC, the leader includes the index and term of the entry in its log that immediately precedes thenew entries. If the follower does not find an entry in its log with the same index and term, then itrefuses the new entries. This helps in log matching. Which implies, a successful `AppendEntries` RPC means a synced log. + * If two entries in different logs have the same index and term, then they store the same command. + * If two entries in different logs have the same index and term, then the logs are identical in all preceding entries. +* When sending an AppendEntriesRPC, the leader includes the index and term of the entry in its log that immediately precedes the new entries. If the follower does not find an entry in its log with the same index and term, then it refuses the new entries. This helps in log matching. Which implies, a successful `AppendEntriesRPC` means a synced log. * Leader crashes inducing inconsistencies in logs: In Raft, the leader handles inconsistencies by forcing the followers’ logs to duplicate its own (the leader's). To do so, the leader must find the latest log entry where the two logs agree, delete any entries in the follower’s log after that point, and send the follower all of the leader’s entries after that point. All of these actions happen in response to the consistency check performed by AppendEntries RPCs. 
Meaning, the leader checks for the consistency by maintaining a `nextIndex` value and dropping it down and sending `AppendEntriesRPC` (which does the consistency check and fails unless they're same) until a success is returned. These "check `AppendEntries` can be empty to save BandWidth(BW). The follower can also help here by sending the smallest agreeing index in the first RPC instead of the leader probing until it reaches the index. * Leader append-only property: A leader never overwrites or deletes entries in its own log. From 01d0c32156e838dd8b2c35e990bd1dcf2c72d63c Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 7 May 2020 23:17:48 +0530 Subject: [PATCH 15/23] Apply suggestions from code review Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com> --- doc/internal/consensus/Consensus-Protocol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index a17ae3d5..67c8370d 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -39,7 +39,7 @@ A detailed description of all the modules follow: 2. A peer establishes itself as leader. 3. Election timer times out or a split vote occurs (leading to no leader) and the process will be repeated. * Election win: Majority votes in the term. (More details in safety) The state of the winner is now `Leader` and the others are `Followers`. - * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updatesits current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. + * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updates its current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. * Transition from working state to Election happens when a leader fails. * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the waiting node's term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. 
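The consistency check described for `AppendEntriesRPC` (the leader includes the index and term of the entry immediately preceding the new ones, and the follower refuses the call if it has no matching entry) can be illustrated with a small Go sketch. The `LogEntry` type, the 0-based indexing and the `appendEntries` signature are assumptions for the example only, not the actual `internal/raft` API.

```go
package main

import "fmt"

// LogEntry is a simplified log entry; the real type in internal/raft may differ.
type LogEntry struct {
	Term    int
	Command string // e.g. a compiled SQL statement
}

// appendEntries sketches the follower-side consistency check: the follower
// rejects the call unless it has an entry at prevLogIndex whose term matches
// prevLogTerm; on success it replaces any conflicting suffix with the
// leader's entries, preserving the log matching property.
func appendEntries(log []LogEntry, prevLogIndex, prevLogTerm int, entries []LogEntry) ([]LogEntry, bool) {
	// prevLogIndex == -1 means the leader is sending from the very beginning.
	if prevLogIndex >= 0 {
		if prevLogIndex >= len(log) || log[prevLogIndex].Term != prevLogTerm {
			return log, false // leader will decrement nextIndex and retry
		}
	}
	// The logs agree up to prevLogIndex; everything after it follows the leader.
	return append(log[:prevLogIndex+1], entries...), true
}

func main() {
	follower := []LogEntry{{1, "CREATE TABLE t"}, {1, "INSERT 1"}}

	// The leader tries to append after an index the follower does not have yet.
	if _, ok := appendEntries(follower, 2, 2, []LogEntry{{2, "INSERT 3"}}); !ok {
		fmt.Println("rejected: follower log is behind, leader backs off nextIndex")
	}

	// A retry with a prefix the follower does have (index 1, term 1) succeeds.
	updated, ok := appendEntries(follower, 1, 1, []LogEntry{{2, "INSERT 2"}})
	fmt.Println(ok, updated)
}
```

The leader-side counterpart is the `nextIndex` probing described above: each rejection moves `nextIndex` back and the RPC is retried until the check passes and the logs are in sync.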
From d354ab942bfde31041f62f66b6a330a85764f740 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Thu, 7 May 2020 23:18:24 +0530 Subject: [PATCH 16/23] Apply suggestions from code review Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com> --- doc/internal/consensus/Consensus-Protocol.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index 67c8370d..d7add717 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -13,7 +13,7 @@ Maintaining consensus is one of the major parts of a distributed system. To know The raft protocol will be implemented in `internal/raft` and will implement APIs that each node can call. -Raft is an algorithm to handle replicated log, and we maintain the "log" of the SQL stmts applied on a DB and have a completely replicated cluster. +Raft is an algorithm to handle replicated log, and we maintain the "log" of the SQL statements applied on a DB and have a completely replicated cluster. #### General Implementation rules: * All RPC calls are done parallely to obtain the best performance. From 2a8c4d9e85af22da287feacaefa8c0d6fc137512 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Fri, 8 May 2020 09:47:41 +0530 Subject: [PATCH 17/23] Update Consensus-Protocol.md --- doc/internal/consensus/Consensus-Protocol.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index d7add717..f00e8473 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -2,9 +2,9 @@ Before talking about consensus, we need to discuss some logistics based on how the systems can co-exist. -* Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The existing `network` layer of the database will handle all the communication between servers. +* Communication: Distributed systems need a method to communicate between each other. Remote Procedure Calls is the mechanism using which a standalone server can talk to another. The existing `network` layer of the database will handle all the communication between servers. However, the messages to be passed are decided by the raft layer. * Security: Access control mechanisms need to be in place to decide on access to functions in the servers based on their state (leader, follower, candidate) -* Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and simply forward this request to the current leader; OR have a proxy that can forward the request to the current leader wheneve the requests come in. (Section client interaction of post has another approach which works too) +* Routing to leader: One of the issues with a varying leader is for the clients to know which IP address to contact for the service. We can solve this problem by advertising any/all IPs of the cluster and the client returns the IP of the leader if its not the leader. * The servers will be implemented in the `interal/node` folders which will import the raft API and perform their functions. Maintaining consensus is one of the major parts of a distributed system. 
To know to have achieved a stable system, we need the following two parts of implementation. @@ -32,16 +32,16 @@ A detailed description of all the modules follow: #### Spec * Startup: All servers start in the follower state and begin by requesting votes to be elected as a leader. * Pre-election: The server increments its `currentTerm` by one, changes to `candidate` state and sends out `RequestVotes` RPC parallely to all the peers. -* Vote condition: FCFS basis. If there was no request to the server, it votes for itself (read 3.6 and clear out when to vote for itself) +* Vote condition: FCFS basis. If there was no request to the server, it votes for itself. * Election timeout: A preset time for which the server waits to see if a peer requested a vote. It is randomly chosen between 150-300ms. * Election is repeated after an election timeout until: 1. The server wins the election 2. A peer establishes itself as leader. 3. Election timer times out or a split vote occurs (leading to no leader) and the process will be repeated. * Election win: Majority votes in the term. (More details in safety) The state of the winner is now `Leader` and the others are `Followers`. - * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updates its current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. + * The term problem: Current terms are exchanged when-ever servers communicate; if one server’s current term is smaller than the other’s, then it updates its current term to the larger value. If a candidate or leader discovers that its term is out of date,it immediately reverts to follower state. If a server receives a request with a stale term number, itrejects the request. All terms which are not the current terms are considered _out of date_. * Maintaining leaders reign: The leader sends `heartbeats` to all servers to establish its reign. This also checks whether other servers are alive based on the response and informs other servers that the leader still is alive too. If the servers do not get timely heartbeat messages, they transform from the `follower` state to `candidate` state. - * Transition from working state to Election happens when a leader fails. + * Transition from leader's normal working state to Election happens when a leader fails. * Maintaining sanity: While waiting for votes, if a `AppendEntriesRPC` is received by the server, and the term of the leader is greater than of equal to the waiting node's term, the leader is considered to be legitimate and the waiter becomes a follower of the leader. If the term of the leader is lesser, it is rejected. * The split vote problem: Though not that common, split votes can occur. To make sure this doesn't continue indefinitely, election timeouts are randomised, making the split votes less probable. 
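The randomised election timeout and heartbeat behaviour in the spec (a fresh timeout in the 150-300ms window, reset on every heartbeat, with the follower turning candidate when it expires) might look roughly like the following sketch; the channel-based `runFollower` loop is an assumption for illustration, not the planned implementation.

```go
package main

import (
	"fmt"
	"math/rand"
	"time"
)

// electionTimeout picks a fresh timeout in the 150-300ms window from the spec;
// re-randomising it on every reset is what makes repeated split votes unlikely.
func electionTimeout() time.Duration {
	return time.Duration(150+rand.Intn(151)) * time.Millisecond
}

// runFollower sketches the follower loop: every heartbeat resets the timer,
// and if the timer fires first the node switches to the candidate state.
func runFollower(heartbeats <-chan struct{}) {
	timer := time.NewTimer(electionTimeout())
	defer timer.Stop()
	for {
		select {
		case <-heartbeats:
			// Leader is alive; restart the countdown with a new random timeout.
			if !timer.Stop() {
				<-timer.C
			}
			timer.Reset(electionTimeout())
		case <-timer.C:
			fmt.Println("no heartbeat in time: becoming candidate and starting election")
			return
		}
	}
}

func main() {
	heartbeats := make(chan struct{})
	go func() {
		for i := 0; i < 3; i++ {
			time.Sleep(100 * time.Millisecond) // leader heartbeats, then "crashes"
			heartbeats <- struct{}{}
		}
	}()
	runFollower(heartbeats)
}
```

Drawing a new random value on every reset, rather than fixing one timeout per node, is what the spec relies on to keep split votes from recurring indefinitely.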
From c32828cb38860fa6ac92846b3f685b40291e2441 Mon Sep 17 00:00:00 2001 From: Sumukha Pk Date: Fri, 8 May 2020 10:23:39 +0530 Subject: [PATCH 18/23] Update Consensus-Protocol.md --- doc/internal/consensus/Consensus-Protocol.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md index f00e8473..6bad4811 100644 --- a/doc/internal/consensus/Consensus-Protocol.md +++ b/doc/internal/consensus/Consensus-Protocol.md @@ -15,17 +15,18 @@ The raft protocol will be implemented in `internal/raft` and will implement APIs Raft is an algorithm to handle replicated log, and we maintain the "log" of the SQL statements applied on a DB and have a completely replicated cluster. -#### General Implementation rules: +#### General Implementation basics: * All RPC calls are done parallely to obtain the best performance. * Request retries are done in case of network failures. * Raft does not assume network preserves ordering of the packets. +* The `raft/raft.go` file has all the state parameters and general functions that raft uses. A raft server may be in any of the 3 states; leader, follower or candidate. All requests are serviced through the leader and it then decides how and if the logs must be replicated in the follower machines. The raft protocol has 3 almost independent modules: 1. Leader Election 2. Log Replication 3. Safety -A detailed description of all the modules follow: +A detailed description of all the modules and their implementation follow: ### Leader Election @@ -48,6 +49,7 @@ A detailed description of all the modules follow: #### Implementation +* The implementation of leader election will span over `raft/leaderElection.go`, request votes over `raft/requestVotes.go` and append entries over `appendEntries.go`. * The raft module will provide a `StartElection` function that enables a node to begin election. This function just begins the election and doesnt return any result of the election. The decision of the election will be handled by the votes and each server independently. * The Leader node is the only one to know its the leader in the beginning. It realises it has obtained the majority votes, and starts behaving like the leader node. During this period, other nodes wait for a possible leader and begin to proceed in the candidate state by advancing to the next term unless the leader contacts them. @@ -72,6 +74,8 @@ A detailed description of all the modules follow: #### Implementation +* Log replication implementation will span over the `raft/logReplication.go` file. + ### Safety This module ensures that the above protocol runs as expected, eliminating the corner cases. @@ -86,7 +90,7 @@ This module ensures that the above protocol runs as expected, eliminating the co #### Implementation -#### Client interaction: +### Client interaction: * Idempotency is maintained by having a unique client ID and the request ID. The same request by the same client cannot be requested twice, we assume here that the client didn't receive the responses due to network errors etc. Each server maintains a _session_ for each client. The session tracks the latest serial number processed for the client, along with the associated response. If a server receives a command whose serial number has already been executed, it responds immediately without re-executing the request. 
* With each request, the client includes the lowest sequence number for which it has not yet received a response, and the state machine then discards all responses for lower sequence numbers. Quite similar to TCP.
* The sessions for a client are _open on all nodes_. Session expiry happens on all nodes at once and in a deterministic way. An LRU policy or an upper bound on the number of sessions can be used. Some sort of periodic probing is done to remove stale sessions.
@@ -98,7 +102,12 @@ This module ensures that the above protocol runs as expected, eliminating the co
* The leader waits for its state machine to advance at least as far as the readIndex; this is current enough to satisfy linearizability.
* Finally, the leader issues the query against its state machine and replies to the client with the results.

+### How the modules interact
+* Leader election is called by every node at init.
+* All nodes send a `RequestVotesRPC` in parallel to all other nodes.
+* If a leader is elected, the leader begins to send `AppendEntriesRPC` to the other nodes (followers) to establish its leadership.
+* Log replication occurs when the `AppendEntriesRPC` is received by the follower.

## A strict testing mechanism

The testing mechanism to be implemented will enable us to figure out the problems existing in the implementation, leading to a more resilient system.

From 6815e8cc9c8d564c0651c6169a33fe397261282f Mon Sep 17 00:00:00 2001
From: Sumukha Pk
Date: Fri, 8 May 2020 10:24:55 +0530
Subject: [PATCH 19/23] Update Consensus-Protocol.md

---
 doc/internal/consensus/Consensus-Protocol.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md
index 6bad4811..ec3659b7 100644
--- a/doc/internal/consensus/Consensus-Protocol.md
+++ b/doc/internal/consensus/Consensus-Protocol.md
@@ -130,7 +130,7 @@ Accepting failures exist and handling them gracefully enables creation of more r

## Appendix

-* The difference between _commit_, _replicate_ and _apply_ with respect raft: What I have understood till now is, applying means letting the log run through the node's state machine. This is the end process, happens after a commit. A commit is once replication happens on a majority of the nodes. While replication is simply appending of a log on _one_ node.
-* Some gotchas I thought about:
+* The difference between _commit_, _replicate_ and _apply_ with respect to raft: Applying implies letting the log entry run through the node's state machine; this is the final step and happens after a commit. A commit happens once replication succeeds on a majority of the nodes, while replication is simply the appending of a log entry on _one_ node.
+* Some gotchas:
  * Client connects to the leader and the leader crashes -> reject the connection. Let the client reconnect once the new leader is established.
  * Some sort of idempotency must be maintained w.r.t. the client-cluster communication. Multiple requests re-submitted by the client because of network errors should not cause problems (a sketch of such request deduplication follows below).
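To make the idempotency gotcha above concrete, here is a minimal sketch of the per-client session table described in the client-interaction section: one entry per client, tracking the latest serial number processed and the associated response. The names (`sessionCache`, `clientSession`, `Apply`) and the `exec` callback are illustrative assumptions and not part of the design doc.

```
package raft

import "sync"

// clientSession remembers the latest request serial number executed for a
// client and the response that was produced for it.
type clientSession struct {
	lastSerial int64  // latest serial number already executed for this client
	lastResult []byte // cached response for that serial number
}

// sessionCache is the per-client session table kept on the nodes.
type sessionCache struct {
	mu       sync.Mutex
	sessions map[string]*clientSession // keyed by client ID
}

func newSessionCache() *sessionCache {
	return &sessionCache{sessions: make(map[string]*clientSession)}
}

// Apply runs a client command at most once: if the serial number was already
// executed, the cached response is returned instead of re-executing.
func (c *sessionCache) Apply(clientID string, serial int64, exec func() []byte) []byte {
	c.mu.Lock()
	defer c.mu.Unlock()

	s, ok := c.sessions[clientID]
	if ok && serial == s.lastSerial {
		return s.lastResult // duplicate request: respond without re-executing
	}

	result := exec()
	c.sessions[clientID] = &clientSession{lastSerial: serial, lastResult: result}
	return result
}
```

Session expiry (an LRU policy or an upper bound, as mentioned above) and the discarding of responses below the client's lowest unacknowledged sequence number would be layered on top of this table; both are omitted to keep the sketch short.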
From 281906e69da1f45f67dd9a98ad68f449948004af Mon Sep 17 00:00:00 2001
From: Sumukha Pk
Date: Fri, 8 May 2020 13:41:52 +0530
Subject: [PATCH 20/23] Update doc/internal/consensus/Consensus-Protocol.md

Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com>
---
 doc/internal/consensus/Consensus-Protocol.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md
index ec3659b7..a50c7cae 100644
--- a/doc/internal/consensus/Consensus-Protocol.md
+++ b/doc/internal/consensus/Consensus-Protocol.md
@@ -49,7 +49,7 @@ A detailed description of all the modules and their implementation follow:

#### Implementation

-* The implementation of leader election will span over `raft/leaderElection.go`, request votes over `raft/requestVotes.go` and append entries over `appendEntries.go`.
+* The implementation of leader election will span over `raft/leader_election.go`, request votes over `raft/request_votes.go` and append entries over `append_entries.go`.
* The raft module will provide a `StartElection` function that enables a node to begin an election. This function just begins the election and doesn't return the election's result. The outcome is determined by the votes, with each server deciding independently.
* The Leader node is the only one that knows it is the leader in the beginning. It realises it has obtained the majority of votes, and starts behaving like the leader node. During this period, other nodes wait for a possible leader and proceed in the candidate state by advancing to the next term, unless the leader contacts them.

From b180d337dd97516b20730cd7513edd6ffe69e8ed Mon Sep 17 00:00:00 2001
From: Sumukha Pk
Date: Fri, 8 May 2020 13:42:01 +0530
Subject: [PATCH 21/23] Update doc/internal/consensus/Consensus-Protocol.md

Co-authored-by: Tim Satke <48135919+TimSatke@users.noreply.github.com>
---
 doc/internal/consensus/Consensus-Protocol.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md
index a50c7cae..efcc2f90 100644
--- a/doc/internal/consensus/Consensus-Protocol.md
+++ b/doc/internal/consensus/Consensus-Protocol.md
@@ -74,7 +74,7 @@ A detailed description of all the modules and their implementation follow:

#### Implementation

-* Log replication implementation will span over the `raft/logReplication.go` file.
+* Log replication implementation will span over the `raft/log_replication.go` file.

### Safety

From fd1f272b3ad5bb28983d9552ac17be6995353fff Mon Sep 17 00:00:00 2001
From: Sumukha Pk
Date: Sun, 10 May 2020 15:09:04 +0530
Subject: [PATCH 22/23] Update Consensus-Protocol.md

---
 doc/internal/consensus/Consensus-Protocol.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md
index efcc2f90..474b15db 100644
--- a/doc/internal/consensus/Consensus-Protocol.md
+++ b/doc/internal/consensus/Consensus-Protocol.md
@@ -64,7 +64,7 @@ A detailed description of all the modules and their implementation follow:
* A committed entry: When a leader decides that the log entry is safe to apply to other state machines, that entry is called committed. All committed entries are durable and _will eventually be executed_ by all state machines.
* An entry -> Committed entry: A log entry is called committed once it is replicated on the majority of the servers in the cluster. Once an entry is committed, it commits all the previous entries in the leader's log, including the entries created by previous leaders.
* The leader keeps track of the highest index that it knows to be committed, and includes it in all future `AppendEntriesRPC` messages (including heartbeats) to inform the other servers.
-* Theres some issue about log committing - "A log entry is committed once the leader that createdthe entry has replicated it on a majority of the servers" and " Once a follower learns that a log entry is committed, it applies theentry to its local state machine (in log order)." are not clear whether replicating and applying to state machine are the same. If they are its kind of a contradiction, else "application" can mean executing the STMT in the DB in our case.
+* There's some ambiguity about log committing - "A log entry is committed once the leader that created the entry has replicated it on a majority of the servers" and "Once a follower learns that a log entry is committed, it applies the entry to its local state machine (in log order)." do not make clear whether replicating and applying to the state machine are the same. If they are, it is somewhat of a contradiction; otherwise "application" can mean executing the STMT in the DB in our case.
* Log matching property:
  * If two entries in different logs have the same index and term, then they store the same command.
  * If two entries in different logs have the same index and term, then the logs are identical in all preceding entries.

From a16b909e9684419d76f7014c9d73e56b7a917bbf Mon Sep 17 00:00:00 2001
From: Sumukha Pk
Date: Sun, 17 May 2020 11:32:14 +0530
Subject: [PATCH 23/23] Update Consensus-Protocol.md

---
 doc/internal/consensus/Consensus-Protocol.md | 74 ++++++++++++++++++++
 1 file changed, 74 insertions(+)

diff --git a/doc/internal/consensus/Consensus-Protocol.md b/doc/internal/consensus/Consensus-Protocol.md
index 474b15db..377289ec 100644
--- a/doc/internal/consensus/Consensus-Protocol.md
+++ b/doc/internal/consensus/Consensus-Protocol.md
@@ -20,6 +20,80 @@ Raft is an algorithm to handle replicated log, and we maintain the "log" of the
* Request retries are done in case of network failures.
* Raft does not assume that the network preserves the ordering of packets.
* The `raft/raft.go` file has all the state parameters and general functions that raft uses.
+* A raft server is implemented as:
+```
+type simpleServer struct {
+	node          *Node
+	cluster       Cluster
+	onReplication ReplicationHandler
+	log           zerolog.Logger
+}
+
+type Node struct {
+	State string
+
+	PersistentState     *PersistentState
+	VolatileState       *VolatileState
+	VolatileStateLeader *VolatileStateLeader
+}
+
+// PersistentState describes the persistent state data on a raft node.
+type PersistentState struct {
+	CurrentTerm int32
+	VotedFor    id.ID // VotedFor is nil at init, and holds the id.ID of the node voted for once voting is complete.
+	Log         []*message.LogData
+
+	SelfID   id.ID
+	LeaderID id.ID          // LeaderID is nil at init, and holds the id.ID of the leader node once it is elected.
+	PeerIPs  []network.Conn // PeerIPs has the connection variables of all the other nodes in the cluster.
+	mu       sync.Mutex
+}
+
+// VolatileState describes the volatile state data on a raft node.
+type VolatileState struct {
+	CommitIndex int
+	LastApplied int
+}
+
+// VolatileStateLeader describes the volatile state data that exists on a raft leader.
+type VolatileStateLeader struct {
+	NextIndex  []int // Holds the nextIndex value for each of the followers in the cluster.
+	MatchIndex []int // Holds the matchIndex value for each of the followers in the cluster.
+}
+```
+* A raft node must always be listening for incoming requests; in the absence of any, it starts a leader election after a timeout. This is implemented as:
+```
+go func() {
+	for {
+		// Wait for incoming data in parallel to the main loop.
+		conn, msg, err := s.cluster.Receive(ctx)
+		if err != nil {
+			return
+		}
+		liveChan <- &incomingData{
+			conn,
+			msg,
+		}
+	}
+}()
+
+for {
+	// If no request (heartbeat, appendEntries, requestVote) is received by
+	// the server (node) before the timer fires, it restarts the leader election.
+	select {
+	case <-randomTimer().C:
+		err = StartElection(node)
+		if err != nil {
+			return
+		}
+	case data := <-liveChan:
+		err = processIncomingData(data, node)
+		if err != nil {
+			return
+		}
+	}
+}
+```

A raft server may be in any of 3 states: leader, follower or candidate. All requests are serviced through the leader, which then decides how and whether the logs must be replicated on the follower machines. The raft protocol has 3 almost independent modules:
1. Leader Election