diff options
Diffstat (limited to 'vendor/github.com/hashicorp/memberlist')
41 files changed, 10877 insertions, 0 deletions
diff --git a/vendor/github.com/hashicorp/memberlist/.gitignore b/vendor/github.com/hashicorp/memberlist/.gitignore new file mode 100644 index 000000000..9158f171a --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/.gitignore @@ -0,0 +1,25 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test +.vagrant/ + diff --git a/vendor/github.com/hashicorp/memberlist/LICENSE b/vendor/github.com/hashicorp/memberlist/LICENSE new file mode 100644 index 000000000..c33dcc7c9 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. “Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. 
“Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. 
Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. 
You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. 
You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. 
If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. 
Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. 
Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. 
+ diff --git a/vendor/github.com/hashicorp/memberlist/Makefile b/vendor/github.com/hashicorp/memberlist/Makefile new file mode 100644 index 000000000..56ef6c28c --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/Makefile @@ -0,0 +1,14 @@ +test: subnet + go test ./... + +integ: subnet + INTEG_TESTS=yes go test ./... + +subnet: + ./test/setup_subnet.sh + +cov: + gocov test github.com/hashicorp/memberlist | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PNONY: test cov integ diff --git a/vendor/github.com/hashicorp/memberlist/README.md b/vendor/github.com/hashicorp/memberlist/README.md new file mode 100644 index 000000000..fc605a59b --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/README.md @@ -0,0 +1,144 @@ +# memberlist [![GoDoc](https://godoc.org/github.com/hashicorp/memberlist?status.png)](https://godoc.org/github.com/hashicorp/memberlist) + +memberlist is a [Go](http://www.golang.org) library that manages cluster +membership and member failure detection using a gossip based protocol. + +The use cases for such a library are far-reaching: all distributed systems +require membership, and memberlist is a re-usable solution to managing +cluster membership and node failure detection. + +memberlist is eventually consistent but converges quickly on average. +The speed at which it converges can be heavily tuned via various knobs +on the protocol. Node failures are detected and network partitions are partially +tolerated by attempting to communicate to potentially dead nodes through +multiple routes. + +## Building + +If you wish to build memberlist you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Usage + +Memberlist is surprisingly simple to use. An example is shown below: + +```go +/* Create the initial memberlist from a safe configuration. + Please reference the godoc for other default config types. 
+ http://godoc.org/github.com/hashicorp/memberlist#Config +*/ +list, err := memberlist.Create(memberlist.DefaultLocalConfig()) +if err != nil { + panic("Failed to create memberlist: " + err.Error()) +} + +// Join an existing cluster by specifying at least one known member. +n, err := list.Join([]string{"1.2.3.4"}) +if err != nil { + panic("Failed to join cluster: " + err.Error()) +} + +// Ask for members of the cluster +for _, member := range list.Members() { + fmt.Printf("Member: %s %s\n", member.Name, member.Addr) +} + +// Continue doing whatever you need, memberlist will maintain membership +// information in the background. Delegates can be used for receiving +// events when members join or leave. +``` + +The most difficult part of memberlist is configuring it since it has many +available knobs in order to tune state propagation delay and convergence times. +Memberlist provides a default configuration that offers a good starting point, +but errs on the side of caution, choosing values that are optimized for +higher convergence at the cost of higher bandwidth usage. + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/memberlist). + +## Protocol + +memberlist is based on ["SWIM: Scalable Weakly-consistent Infection-style Process Group Membership Protocol"](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf), +with a few minor adaptations, mostly to increase propagation speed and +convergence rate. + +A high level overview of the memberlist protocol (based on SWIM) is +described below, but for details please read the full +[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) +followed by the memberlist source. We welcome any questions related +to the protocol on our issue tracker. + +### Protocol Description + +memberlist begins by joining an existing cluster or starting a new +cluster. If starting a new cluster, additional nodes are expected to join +it. 
New nodes in an existing cluster must be given the address of at +least one existing member in order to join the cluster. The new member +does a full state sync with the existing member over TCP and begins gossiping its +existence to the cluster. + +Gossip is done over UDP with a configurable but fixed fanout and interval. +This ensures that network usage is constant with regards to number of nodes, as opposed to +exponential growth that can occur with traditional heartbeat mechanisms. +Complete state exchanges with a random node are done periodically over +TCP, but much less often than gossip messages. This increases the likelihood +that the membership list converges properly since the full state is exchanged +and merged. The interval between full state exchanges is configurable or can +be disabled entirely. + +Failure detection is done by periodic random probing using a configurable interval. +If the node fails to ack within a reasonable time (typically some multiple +of RTT), then an indirect probe as well as a direct TCP probe are attempted. An +indirect probe asks a configurable number of random nodes to probe the same node, +in case there are network issues causing our own node to fail the probe. The direct +TCP probe is used to help identify the common situation where networking is +misconfigured to allow TCP but not UDP. Without the TCP probe, a UDP-isolated node +would think all other nodes were suspect and could cause churn in the cluster when +it attempts a TCP-based state exchange with another node. It is not desirable to +operate with only TCP connectivity because convergence will be much slower, but it +is enabled so that memberlist can detect this situation and alert operators. + +If both our probe, the indirect probes, and the direct TCP probe fail within a +configurable time, then the node is marked "suspicious" and this knowledge is +gossiped to the cluster. A suspicious node is still considered a member of +cluster. 
If the suspect member of the cluster does not dispute the suspicion +within a configurable period of time, the node is finally considered dead, +and this state is then gossiped to the cluster. + +This is a brief and incomplete description of the protocol. For a better idea, +please read the +[SWIM paper](http://www.cs.cornell.edu/~asdas/research/dsn02-swim.pdf) +in its entirety, along with the memberlist source code. + +### Changes from SWIM + +As mentioned earlier, the memberlist protocol is based on SWIM but includes +minor changes, mostly to increase propagation speed and convergence rates. + +The changes from SWIM are noted here: + +* memberlist does a full state sync over TCP periodically. SWIM only propagates + changes over gossip. While both eventually reach convergence, the full state + sync increases the likelihood that nodes are fully converged more quickly, + at the expense of more bandwidth usage. This feature can be totally disabled + if you wish. + +* memberlist has a dedicated gossip layer separate from the failure detection + protocol. SWIM only piggybacks gossip messages on top of probe/ack messages. + memberlist also piggybacks gossip messages on top of probe/ack messages, but + also will periodically send out dedicated gossip messages on their own. This + feature lets you have a higher gossip rate (for example once per 200ms) + and a slower failure detection rate (such as once per second), resulting + in overall faster convergence rates and data propagation speeds. This feature + can be totally disabed as well, if you wish. + +* memberlist stores around the state of dead nodes for a set amount of time, + so that when full syncs are requested, the requester also receives information + about dead nodes. Because SWIM doesn't do full syncs, SWIM deletes dead node + state immediately upon learning that the node is dead. This change again helps + the cluster converge more quickly. 
diff --git a/vendor/github.com/hashicorp/memberlist/alive_delegate.go b/vendor/github.com/hashicorp/memberlist/alive_delegate.go new file mode 100644 index 000000000..51a0ba905 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/alive_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +// AliveDelegate is used to involve a client in processing +// a node "alive" message. When a node joins, either through +// a UDP gossip or TCP push/pull, we update the state of +// that node via an alive message. This can be used to filter +// a node out and prevent it from being considered a peer +// using application specific logic. +type AliveDelegate interface { + // NotifyMerge is invoked when a merge could take place. + // Provides a list of the nodes known by the peer. If + // the return value is non-nil, the merge is canceled. + NotifyAlive(peer *Node) error +} diff --git a/vendor/github.com/hashicorp/memberlist/awareness.go b/vendor/github.com/hashicorp/memberlist/awareness.go new file mode 100644 index 000000000..ea95c7538 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/awareness.go @@ -0,0 +1,69 @@ +package memberlist + +import ( + "sync" + "time" + + "github.com/armon/go-metrics" +) + +// awareness manages a simple metric for tracking the estimated health of the +// local node. Health is primary the node's ability to respond in the soft +// real-time manner required for correct health checking of other nodes in the +// cluster. +type awareness struct { + sync.RWMutex + + // max is the upper threshold for the timeout scale (the score will be + // constrained to be from 0 <= score < max). + max int + + // score is the current awareness score. Lower values are healthier and + // zero is the minimum value. + score int +} + +// newAwareness returns a new awareness object. +func newAwareness(max int) *awareness { + return &awareness{ + max: max, + score: 0, + } +} + +// ApplyDelta takes the given delta and applies it to the score in a thread-safe +// manner. 
It also enforces a floor of zero and a max of max, so deltas may not +// change the overall score if it's railed at one of the extremes. +func (a *awareness) ApplyDelta(delta int) { + a.Lock() + initial := a.score + a.score += delta + if a.score < 0 { + a.score = 0 + } else if a.score > (a.max - 1) { + a.score = (a.max - 1) + } + final := a.score + a.Unlock() + + if initial != final { + metrics.SetGauge([]string{"memberlist", "health", "score"}, float32(final)) + } +} + +// GetHealthScore returns the raw health score. +func (a *awareness) GetHealthScore() int { + a.RLock() + score := a.score + a.RUnlock() + return score +} + +// ScaleTimeout takes the given duration and scales it based on the current +// score. Less healthyness will lead to longer timeouts. +func (a *awareness) ScaleTimeout(timeout time.Duration) time.Duration { + a.RLock() + score := a.score + a.RUnlock() + return timeout * (time.Duration(score) + 1) +} diff --git a/vendor/github.com/hashicorp/memberlist/awareness_test.go b/vendor/github.com/hashicorp/memberlist/awareness_test.go new file mode 100644 index 000000000..c6ade10af --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/awareness_test.go @@ -0,0 +1,41 @@ +package memberlist + +import ( + "testing" + "time" +) + +func TestAwareness(t *testing.T) { + cases := []struct { + delta int + score int + timeout time.Duration + }{ + {0, 0, 1 * time.Second}, + {-1, 0, 1 * time.Second}, + {-10, 0, 1 * time.Second}, + {1, 1, 2 * time.Second}, + {-1, 0, 1 * time.Second}, + {10, 7, 8 * time.Second}, + {-1, 6, 7 * time.Second}, + {-1, 5, 6 * time.Second}, + {-1, 4, 5 * time.Second}, + {-1, 3, 4 * time.Second}, + {-1, 2, 3 * time.Second}, + {-1, 1, 2 * time.Second}, + {-1, 0, 1 * time.Second}, + {-1, 0, 1 * time.Second}, + } + + a := newAwareness(8) + for i, c := range cases { + a.ApplyDelta(c.delta) + if a.GetHealthScore() != c.score { + t.Errorf("case %d: score mismatch %d != %d", i, a.score, c.score) + } + if timeout := a.ScaleTimeout(1 * 
time.Second); timeout != c.timeout { + t.Errorf("case %d: scaled timeout mismatch %9.6f != %9.6f", + i, timeout.Seconds(), c.timeout.Seconds()) + } + } +} diff --git a/vendor/github.com/hashicorp/memberlist/broadcast.go b/vendor/github.com/hashicorp/memberlist/broadcast.go new file mode 100644 index 000000000..f7e85a119 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/broadcast.go @@ -0,0 +1,100 @@ +package memberlist + +/* +The broadcast mechanism works by maintaining a sorted list of messages to be +sent out. When a message is to be broadcast, the retransmit count +is set to zero and appended to the queue. The retransmit count serves +as the "priority", ensuring that newer messages get sent first. Once +a message hits the retransmit limit, it is removed from the queue. + +Additionally, older entries can be invalidated by new messages that +are contradictory. For example, if we send "{suspect M1 inc: 1}, +then a following {alive M1 inc: 2} will invalidate that message +*/ + +type memberlistBroadcast struct { + node string + msg []byte + notify chan struct{} +} + +func (b *memberlistBroadcast) Invalidates(other Broadcast) bool { + // Check if that broadcast is a memberlist type + mb, ok := other.(*memberlistBroadcast) + if !ok { + return false + } + + // Invalidates any message about the same node + return b.node == mb.node +} + +func (b *memberlistBroadcast) Message() []byte { + return b.msg +} + +func (b *memberlistBroadcast) Finished() { + select { + case b.notify <- struct{}{}: + default: + } +} + +// encodeAndBroadcast encodes a message and enqueues it for broadcast. Fails +// silently if there is an encoding error. +func (m *Memberlist) encodeAndBroadcast(node string, msgType messageType, msg interface{}) { + m.encodeBroadcastNotify(node, msgType, msg, nil) +} + +// encodeBroadcastNotify encodes a message and enqueues it for broadcast +// and notifies the given channel when transmission is finished. Fails +// silently if there is an encoding error. 
+func (m *Memberlist) encodeBroadcastNotify(node string, msgType messageType, msg interface{}, notify chan struct{}) { + buf, err := encode(msgType, msg) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode message for broadcast: %s", err) + } else { + m.queueBroadcast(node, buf.Bytes(), notify) + } +} + +// queueBroadcast is used to start dissemination of a message. It will be +// sent up to a configured number of times. The message could potentially +// be invalidated by a future message about the same node +func (m *Memberlist) queueBroadcast(node string, msg []byte, notify chan struct{}) { + b := &memberlistBroadcast{node, msg, notify} + m.broadcasts.QueueBroadcast(b) +} + +// getBroadcasts is used to return a slice of broadcasts to send up to +// a maximum byte size, while imposing a per-broadcast overhead. This is used +// to fill a UDP packet with piggybacked data +func (m *Memberlist) getBroadcasts(overhead, limit int) [][]byte { + // Get memberlist messages first + toSend := m.broadcasts.GetBroadcasts(overhead, limit) + + // Check if the user has anything to broadcast + d := m.config.Delegate + if d != nil { + // Determine the bytes used already + bytesUsed := 0 + for _, msg := range toSend { + bytesUsed += len(msg) + overhead + } + + // Check space remaining for user messages + avail := limit - bytesUsed + if avail > overhead+userMsgOverhead { + userMsgs := d.GetBroadcasts(overhead+userMsgOverhead, avail) + + // Frame each user message + for _, msg := range userMsgs { + buf := make([]byte, 1, len(msg)+1) + buf[0] = byte(userMsg) + buf = append(buf, msg...) 
+ toSend = append(toSend, buf) + } + } + } + return toSend +} diff --git a/vendor/github.com/hashicorp/memberlist/broadcast_test.go b/vendor/github.com/hashicorp/memberlist/broadcast_test.go new file mode 100644 index 000000000..c6a7302cc --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/broadcast_test.go @@ -0,0 +1,27 @@ +package memberlist + +import ( + "reflect" + "testing" +) + +func TestMemberlistBroadcast_Invalidates(t *testing.T) { + m1 := &memberlistBroadcast{"test", nil, nil} + m2 := &memberlistBroadcast{"foo", nil, nil} + + if m1.Invalidates(m2) || m2.Invalidates(m1) { + t.Fatalf("unexpected invalidation") + } + + if !m1.Invalidates(m1) { + t.Fatalf("expected invalidation") + } +} + +func TestMemberlistBroadcast_Message(t *testing.T) { + m1 := &memberlistBroadcast{"test", []byte("test"), nil} + msg := m1.Message() + if !reflect.DeepEqual(msg, []byte("test")) { + t.Fatalf("messages do not match") + } +} diff --git a/vendor/github.com/hashicorp/memberlist/config.go b/vendor/github.com/hashicorp/memberlist/config.go new file mode 100644 index 000000000..2f43d14cb --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/config.go @@ -0,0 +1,288 @@ +package memberlist + +import ( + "io" + "log" + "os" + "time" +) + +type Config struct { + // The name of this node. This must be unique in the cluster. + Name string + + // Transport is a hook for providing custom code to communicate with + // other nodes. If this is left nil, then memberlist will by default + // make a NetTransport using BindAddr and BindPort from this structure. + Transport Transport + + // Configuration related to what address to bind to and ports to + // listen on. The port is used for both UDP and TCP gossip. It is + // assumed other nodes are running on this port, but they do not need + // to. + BindAddr string + BindPort int + + // Configuration related to what address to advertise to other + // cluster members. Used for nat traversal. 
+ AdvertiseAddr string + AdvertisePort int + + // ProtocolVersion is the configured protocol version that we + // will _speak_. This must be between ProtocolVersionMin and + // ProtocolVersionMax. + ProtocolVersion uint8 + + // TCPTimeout is the timeout for establishing a stream connection with + // a remote node for a full state sync, and for stream read and write + // operations. This is a legacy name for backwards compatibility, but + // should really be called StreamTimeout now that we have generalized + // the transport. + TCPTimeout time.Duration + + // IndirectChecks is the number of nodes that will be asked to perform + // an indirect probe of a node in the case a direct probe fails. Memberlist + // waits for an ack from any single indirect node, so increasing this + // number will increase the likelihood that an indirect probe will succeed + // at the expense of bandwidth. + IndirectChecks int + + // RetransmitMult is the multiplier for the number of retransmissions + // that are attempted for messages broadcasted over gossip. The actual + // count of retransmissions is calculated using the formula: + // + // Retransmits = RetransmitMult * log(N+1) + // + // This allows the retransmits to scale properly with cluster size. The + // higher the multiplier, the more likely a failed broadcast is to converge + // at the expense of increased bandwidth. + RetransmitMult int + + // SuspicionMult is the multiplier for determining the time an + // inaccessible node is considered suspect before declaring it dead. + // The actual timeout is calculated using the formula: + // + // SuspicionTimeout = SuspicionMult * log(N+1) * ProbeInterval + // + // This allows the timeout to scale properly with expected propagation + // delay with a larger cluster size. The higher the multiplier, the longer + // an inaccessible node is considered part of the cluster before declaring + // it dead, giving that suspect node more time to refute if it is indeed + // still alive. 
+ SuspicionMult int + + // SuspicionMaxTimeoutMult is the multiplier applied to the + // SuspicionTimeout used as an upper bound on detection time. This max + // timeout is calculated using the formula: + // + // SuspicionMaxTimeout = SuspicionMaxTimeoutMult * SuspicionTimeout + // + // If everything is working properly, confirmations from other nodes will + // accelerate suspicion timers in a manner which will cause the timeout + // to reach the base SuspicionTimeout before that elapses, so this value + // will typically only come into play if a node is experiencing issues + // communicating with other nodes. It should be set to a something fairly + // large so that a node having problems will have a lot of chances to + // recover before falsely declaring other nodes as failed, but short + // enough for a legitimately isolated node to still make progress marking + // nodes failed in a reasonable amount of time. + SuspicionMaxTimeoutMult int + + // PushPullInterval is the interval between complete state syncs. + // Complete state syncs are done with a single node over TCP and are + // quite expensive relative to standard gossiped messages. Setting this + // to zero will disable state push/pull syncs completely. + // + // Setting this interval lower (more frequent) will increase convergence + // speeds across larger clusters at the expense of increased bandwidth + // usage. + PushPullInterval time.Duration + + // ProbeInterval and ProbeTimeout are used to configure probing + // behavior for memberlist. + // + // ProbeInterval is the interval between random node probes. Setting + // this lower (more frequent) will cause the memberlist cluster to detect + // failed nodes more quickly at the expense of increased bandwidth usage. + // + // ProbeTimeout is the timeout to wait for an ack from a probed node + // before assuming it is unhealthy. This should be set to 99-percentile + // of RTT (round-trip time) on your network. 
+ ProbeInterval time.Duration + ProbeTimeout time.Duration + + // DisableTcpPings will turn off the fallback TCP pings that are attempted + // if the direct UDP ping fails. These get pipelined along with the + // indirect UDP pings. + DisableTcpPings bool + + // AwarenessMaxMultiplier will increase the probe interval if the node + // becomes aware that it might be degraded and not meeting the soft real + // time requirements to reliably probe other nodes. + AwarenessMaxMultiplier int + + // GossipInterval and GossipNodes are used to configure the gossip + // behavior of memberlist. + // + // GossipInterval is the interval between sending messages that need + // to be gossiped that haven't been able to piggyback on probing messages. + // If this is set to zero, non-piggyback gossip is disabled. By lowering + // this value (more frequent) gossip messages are propagated across + // the cluster more quickly at the expense of increased bandwidth. + // + // GossipNodes is the number of random nodes to send gossip messages to + // per GossipInterval. Increasing this number causes the gossip messages + // to propagate across the cluster more quickly at the expense of + // increased bandwidth. + // + // GossipToTheDeadTime is the interval after which a node has died that + // we will still try to gossip to it. This gives it a chance to refute. + GossipInterval time.Duration + GossipNodes int + GossipToTheDeadTime time.Duration + + // EnableCompression is used to control message compression. This can + // be used to reduce bandwidth usage at the cost of slightly more CPU + // utilization. This is only available starting at protocol version 1. + EnableCompression bool + + // SecretKey is used to initialize the primary encryption key in a keyring. + // The primary encryption key is the only key used to encrypt messages and + // the first key used while attempting to decrypt messages. 
Providing a + // value for this primary key will enable message-level encryption and + // verification, and automatically install the key onto the keyring. + // The value should be either 16, 24, or 32 bytes to select AES-128, + // AES-192, or AES-256. + SecretKey []byte + + // The keyring holds all of the encryption keys used internally. It is + // automatically initialized using the SecretKey and SecretKeys values. + Keyring *Keyring + + // Delegate and Events are delegates for receiving and providing + // data to memberlist via callback mechanisms. For Delegate, see + // the Delegate interface. For Events, see the EventDelegate interface. + // + // The DelegateProtocolMin/Max are used to guarantee protocol-compatibility + // for any custom messages that the delegate might do (broadcasts, + // local/remote state, etc.). If you don't set these, then the protocol + // versions will just be zero, and version compliance won't be done. + Delegate Delegate + DelegateProtocolVersion uint8 + DelegateProtocolMin uint8 + DelegateProtocolMax uint8 + Events EventDelegate + Conflict ConflictDelegate + Merge MergeDelegate + Ping PingDelegate + Alive AliveDelegate + + // DNSConfigPath points to the system's DNS config file, usually located + // at /etc/resolv.conf. It can be overridden via config for easier testing. + DNSConfigPath string + + // LogOutput is the writer where logs should be sent. If this is not + // set, logging will go to stderr by default. You cannot specify both LogOutput + // and Logger at the same time. + LogOutput io.Writer + + // Logger is a custom logger which you provide. If Logger is set, it will use + // this for the internal logger. If Logger is not set, it will fall back to the + // behavior for using LogOutput. You cannot specify both LogOutput and Logger + // at the same time. + Logger *log.Logger + + // Size of Memberlist's internal channel which handles UDP messages. 
The + // size of this determines the size of the queue which Memberlist will keep + // while UDP messages are handled. + HandoffQueueDepth int + + // Maximum number of bytes that memberlist will put in a packet (this + // will be for UDP packets by default with a NetTransport). A safe value + // for this is typically 1400 bytes (which is the default). However, + // depending on your network's MTU (Maximum Transmission Unit) you may + // be able to increase this to get more content into each gossip packet. + // This is a legacy name for backward compatibility but should really be + // called PacketBufferSize now that we have generalized the transport. + UDPBufferSize int +} + +// DefaultLANConfig returns a sane set of configurations for Memberlist. +// It uses the hostname as the node name, and otherwise sets very conservative +// values that are sane for most LAN environments. The default configuration +// errs on the side of caution, choosing values that are optimized +// for higher convergence at the cost of higher bandwidth usage. Regardless, +// these values are a good starting point when getting started with memberlist. 
+func DefaultLANConfig() *Config { + hostname, _ := os.Hostname() + return &Config{ + Name: hostname, + BindAddr: "0.0.0.0", + BindPort: 7946, + AdvertiseAddr: "", + AdvertisePort: 7946, + ProtocolVersion: ProtocolVersion2Compatible, + TCPTimeout: 10 * time.Second, // Timeout after 10 seconds + IndirectChecks: 3, // Use 3 nodes for the indirect ping + RetransmitMult: 4, // Retransmit a message 4 * log(N+1) nodes + SuspicionMult: 5, // Suspect a node for 5 * log(N+1) * Interval + SuspicionMaxTimeoutMult: 6, // For 10k nodes this will give a max timeout of 120 seconds + PushPullInterval: 30 * time.Second, // Low frequency + ProbeTimeout: 500 * time.Millisecond, // Reasonable RTT time for LAN + ProbeInterval: 1 * time.Second, // Failure check every second + DisableTcpPings: false, // TCP pings are safe, even with mixed versions + AwarenessMaxMultiplier: 8, // Probe interval backs off to 8 seconds + + GossipNodes: 3, // Gossip to 3 nodes + GossipInterval: 200 * time.Millisecond, // Gossip more rapidly + GossipToTheDeadTime: 30 * time.Second, // Same as push/pull + + EnableCompression: true, // Enable compression by default + + SecretKey: nil, + Keyring: nil, + + DNSConfigPath: "/etc/resolv.conf", + + HandoffQueueDepth: 1024, + UDPBufferSize: 1400, + } +} + +// DefaultWANConfig works like DefaultConfig, however it returns a configuration +// that is optimized for most WAN environments. The default configuration is +// still very conservative and errs on the side of caution. 
+func DefaultWANConfig() *Config { + conf := DefaultLANConfig() + conf.TCPTimeout = 30 * time.Second + conf.SuspicionMult = 6 + conf.PushPullInterval = 60 * time.Second + conf.ProbeTimeout = 3 * time.Second + conf.ProbeInterval = 5 * time.Second + conf.GossipNodes = 4 // Gossip less frequently, but to an additional node + conf.GossipInterval = 500 * time.Millisecond + conf.GossipToTheDeadTime = 60 * time.Second + return conf +} + +// DefaultLocalConfig works like DefaultConfig, however it returns a configuration +// that is optimized for a local loopback environments. The default configuration is +// still very conservative and errs on the side of caution. +func DefaultLocalConfig() *Config { + conf := DefaultLANConfig() + conf.TCPTimeout = time.Second + conf.IndirectChecks = 1 + conf.RetransmitMult = 2 + conf.SuspicionMult = 3 + conf.PushPullInterval = 15 * time.Second + conf.ProbeTimeout = 200 * time.Millisecond + conf.ProbeInterval = time.Second + conf.GossipInterval = 100 * time.Millisecond + conf.GossipToTheDeadTime = 15 * time.Second + return conf +} + +// Returns whether or not encryption is enabled +func (c *Config) EncryptionEnabled() bool { + return c.Keyring != nil && len(c.Keyring.GetKeys()) > 0 +} diff --git a/vendor/github.com/hashicorp/memberlist/conflict_delegate.go b/vendor/github.com/hashicorp/memberlist/conflict_delegate.go new file mode 100644 index 000000000..f52b136eb --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/conflict_delegate.go @@ -0,0 +1,10 @@ +package memberlist + +// ConflictDelegate is a used to inform a client that +// a node has attempted to join which would result in a +// name conflict. This happens if two clients are configured +// with the same name but different addresses. 
type ConflictDelegate interface {
	// NotifyConflict is invoked when a name conflict is detected. The
	// existing node is the one already known locally; other carries the
	// conflicting announcement's address information.
	NotifyConflict(existing, other *Node)
}

// ---- file: delegate.go ----

package memberlist

// Delegate is the interface that clients must implement if they want to hook
// into the gossip layer of Memberlist. All the methods must be thread-safe,
// as they can and generally will be called concurrently.
type Delegate interface {
	// NodeMeta is used to retrieve meta-data about the current node
	// when broadcasting an alive message. Its length is limited to
	// the given byte size. This metadata is available in the Node structure.
	NodeMeta(limit int) []byte

	// NotifyMsg is called when a user-data message is received.
	// Care should be taken that this method does not block, since doing
	// so would block the entire UDP packet receive loop. Additionally, the byte
	// slice may be modified after the call returns, so it should be copied if needed.
	NotifyMsg([]byte)

	// GetBroadcasts is called when user data messages can be broadcast.
	// It can return a list of buffers to send. Each buffer should assume an
	// overhead as provided with a limit on the total byte size allowed.
	// The total byte size of the resulting data to send must not exceed
	// the limit. Care should be taken that this method does not block,
	// since doing so would block the entire UDP packet receive loop.
	GetBroadcasts(overhead, limit int) [][]byte

	// LocalState is used for a TCP Push/Pull. This is sent to
	// the remote side in addition to the membership information. Any
	// data can be sent here. See MergeRemoteState as well. The `join`
	// boolean indicates this is for a join instead of a push/pull.
	LocalState(join bool) []byte

	// MergeRemoteState is invoked after a TCP Push/Pull. This is the
	// state received from the remote side and is the result of the
	// remote side's LocalState call. The 'join'
	// boolean indicates this is for a join instead of a push/pull.
	MergeRemoteState(buf []byte, join bool)
}

// ---- file: event_delegate.go ----

package memberlist

// EventDelegate is a simpler delegate that is used only to receive
// notifications about members joining and leaving. The methods in this
// delegate may be called by multiple goroutines, but never concurrently.
// This allows you to reason about ordering.
type EventDelegate interface {
	// NotifyJoin is invoked when a node is detected to have joined.
	// The Node argument must not be modified.
	NotifyJoin(*Node)

	// NotifyLeave is invoked when a node is detected to have left.
	// The Node argument must not be modified.
	NotifyLeave(*Node)

	// NotifyUpdate is invoked when a node is detected to have
	// updated, usually involving the meta data. The Node argument
	// must not be modified.
	NotifyUpdate(*Node)
}

// ChannelEventDelegate is used to enable an application to receive
// events about joins and leaves over a channel instead of a direct
// function call.
//
// Care must be taken that events are processed in a timely manner from
// the channel, since this delegate will block until an event can be sent.
type ChannelEventDelegate struct {
	// Ch receives one NodeEvent per membership change; sends block until
	// the application drains the channel.
	Ch chan<- NodeEvent
}

// NodeEventType are the types of events that can be sent from the
// ChannelEventDelegate.
+type NodeEventType int + +const ( + NodeJoin NodeEventType = iota + NodeLeave + NodeUpdate +) + +// NodeEvent is a single event related to node activity in the memberlist. +// The Node member of this struct must not be directly modified. It is passed +// as a pointer to avoid unnecessary copies. If you wish to modify the node, +// make a copy first. +type NodeEvent struct { + Event NodeEventType + Node *Node +} + +func (c *ChannelEventDelegate) NotifyJoin(n *Node) { + c.Ch <- NodeEvent{NodeJoin, n} +} + +func (c *ChannelEventDelegate) NotifyLeave(n *Node) { + c.Ch <- NodeEvent{NodeLeave, n} +} + +func (c *ChannelEventDelegate) NotifyUpdate(n *Node) { + c.Ch <- NodeEvent{NodeUpdate, n} +} diff --git a/vendor/github.com/hashicorp/memberlist/integ_test.go b/vendor/github.com/hashicorp/memberlist/integ_test.go new file mode 100644 index 000000000..f519c6baa --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/integ_test.go @@ -0,0 +1,89 @@ +package memberlist + +import ( + "fmt" + "log" + "os" + "testing" + "time" +) + +// CheckInteg will skip a test if integration testing is not enabled. +func CheckInteg(t *testing.T) { + if !IsInteg() { + t.SkipNow() + } +} + +// IsInteg returns a boolean telling you if we're in integ testing mode. +func IsInteg() bool { + return os.Getenv("INTEG_TESTS") != "" +} + +// Tests the memberlist by creating a cluster of 100 nodes +// and checking that we get strong convergence of changes. 
func TestMemberlist_Integ(t *testing.T) {
	CheckInteg(t)

	// NOTE(review): despite the doc comment above, this spins up 16 nodes,
	// not 100.
	num := 16
	var members []*Memberlist

	secret := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
	eventCh := make(chan NodeEvent, num)

	addr := "127.0.0.1"
	for i := 0; i < num; i++ {
		c := DefaultLANConfig()
		c.Name = fmt.Sprintf("%s:%d", addr, 12345+i)
		c.BindAddr = addr
		c.BindPort = 12345 + i
		// Aggressive timing so the cluster converges within the short
		// wait window below.
		c.ProbeInterval = 20 * time.Millisecond
		c.ProbeTimeout = 100 * time.Millisecond
		c.GossipInterval = 20 * time.Millisecond
		c.PushPullInterval = 200 * time.Millisecond
		c.SecretKey = secret

		// Only the first node records events; it observes all later joins.
		if i == 0 {
			c.Events = &ChannelEventDelegate{eventCh}
		}

		m, err := Create(c)
		if err != nil {
			t.Fatalf("unexpected err: %s", err)
		}
		members = append(members, m)
		defer m.Shutdown()

		// Chain each node to the previously created one.
		if i > 0 {
			last := members[i-1]
			// NOTE(review): this `num` shadows the outer node count, and
			// when Join returns (0, nil) the Fatalf formats a nil error.
			num, err := m.Join([]string{last.config.Name})
			if num == 0 || err != nil {
				t.Fatalf("unexpected err: %s", err)
			}
		}
	}

	// Wait and print debug info
	breakTimer := time.After(250 * time.Millisecond)
WAIT:
	for {
		select {
		case e := <-eventCh:
			if e.Event == NodeJoin {
				log.Printf("[DEBUG] Node join: %v (%d)", *e.Node, members[0].NumMembers())
			} else {
				log.Printf("[DEBUG] Node leave: %v (%d)", *e.Node, members[0].NumMembers())
			}
		case <-breakTimer:
			break WAIT
		}
	}

	// Every node should have converged on the full membership.
	for idx, m := range members {
		got := m.NumMembers()
		if got != num {
			t.Errorf("bad num members at idx %d. Expected %d. Got %d.",
				idx, num, got)
		}
	}
}

// ---- file: keyring.go ----

package memberlist

import (
	"bytes"
	"fmt"
	"sync"
)

// Keyring manages the ordered set of encryption keys used by memberlist.
type Keyring struct {
	// Keys stores the key data used during encryption and decryption. It is
	// ordered in such a way where the first key (index 0) is the primary key,
	// which is used for encrypting messages, and is the first key tried during
	// message decryption.
	keys [][]byte

	// The keyring lock is used while performing IO operations on the keyring.
	l sync.Mutex
}

// Init allocates substructures
func (k *Keyring) init() {
	k.keys = make([][]byte, 0)
}

// NewKeyring constructs a new container for a set of encryption keys. The
// keyring contains all key data used internally by memberlist.
//
// While creating a new keyring, you must do one of:
//   - Omit keys and primary key, effectively disabling encryption
//   - Pass a set of keys plus the primary key
//   - Pass only a primary key
//
// If only a primary key is passed, then it will be automatically added to the
// keyring. If creating a keyring with multiple keys, one key must be designated
// primary by passing it as the primaryKey. If the primaryKey does not exist in
// the list of secondary keys, it will be automatically added at position 0.
//
// A key should be either 16, 24, or 32 bytes to select AES-128,
// AES-192, or AES-256.
func NewKeyring(keys [][]byte, primaryKey []byte) (*Keyring, error) {
	keyring := &Keyring{}
	keyring.init()

	if len(keys) > 0 || len(primaryKey) > 0 {
		if len(primaryKey) == 0 {
			return nil, fmt.Errorf("Empty primary key not allowed")
		}
		// Install the primary first so it lands at index 0; AddKey is a
		// no-op when the same key reappears in keys below.
		if err := keyring.AddKey(primaryKey); err != nil {
			return nil, err
		}
		for _, key := range keys {
			if err := keyring.AddKey(key); err != nil {
				return nil, err
			}
		}
	}

	return keyring, nil
}

// ValidateKey will check to see if the key is valid and returns an error if not.
//
// key should be either 16, 24, or 32 bytes to select AES-128,
// AES-192, or AES-256.
+func ValidateKey(key []byte) error { + if l := len(key); l != 16 && l != 24 && l != 32 { + return fmt.Errorf("key size must be 16, 24 or 32 bytes") + } + return nil +} + +// AddKey will install a new key on the ring. Adding a key to the ring will make +// it available for use in decryption. If the key already exists on the ring, +// this function will just return noop. +// +// key should be either 16, 24, or 32 bytes to select AES-128, +// AES-192, or AES-256. +func (k *Keyring) AddKey(key []byte) error { + if err := ValidateKey(key); err != nil { + return err + } + + // No-op if key is already installed + for _, installedKey := range k.keys { + if bytes.Equal(installedKey, key) { + return nil + } + } + + keys := append(k.keys, key) + primaryKey := k.GetPrimaryKey() + if primaryKey == nil { + primaryKey = key + } + k.installKeys(keys, primaryKey) + return nil +} + +// UseKey changes the key used to encrypt messages. This is the only key used to +// encrypt messages, so peers should know this key before this method is called. +func (k *Keyring) UseKey(key []byte) error { + for _, installedKey := range k.keys { + if bytes.Equal(key, installedKey) { + k.installKeys(k.keys, key) + return nil + } + } + return fmt.Errorf("Requested key is not in the keyring") +} + +// RemoveKey drops a key from the keyring. This will return an error if the key +// requested for removal is currently at position 0 (primary key). +func (k *Keyring) RemoveKey(key []byte) error { + if bytes.Equal(key, k.keys[0]) { + return fmt.Errorf("Removing the primary key is not allowed") + } + for i, installedKey := range k.keys { + if bytes.Equal(key, installedKey) { + keys := append(k.keys[:i], k.keys[i+1:]...) + k.installKeys(keys, k.keys[0]) + } + } + return nil +} + +// installKeys will take out a lock on the keyring, and replace the keys with a +// new set of keys. The key indicated by primaryKey will be installed as the new +// primary key. 
+func (k *Keyring) installKeys(keys [][]byte, primaryKey []byte) { + k.l.Lock() + defer k.l.Unlock() + + newKeys := [][]byte{primaryKey} + for _, key := range keys { + if !bytes.Equal(key, primaryKey) { + newKeys = append(newKeys, key) + } + } + k.keys = newKeys +} + +// GetKeys returns the current set of keys on the ring. +func (k *Keyring) GetKeys() [][]byte { + k.l.Lock() + defer k.l.Unlock() + + return k.keys +} + +// GetPrimaryKey returns the key on the ring at position 0. This is the key used +// for encrypting messages, and is the first key tried for decrypting messages. +func (k *Keyring) GetPrimaryKey() (key []byte) { + k.l.Lock() + defer k.l.Unlock() + + if len(k.keys) > 0 { + key = k.keys[0] + } + return +} diff --git a/vendor/github.com/hashicorp/memberlist/keyring_test.go b/vendor/github.com/hashicorp/memberlist/keyring_test.go new file mode 100644 index 000000000..eec699fd0 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/keyring_test.go @@ -0,0 +1,154 @@ +package memberlist + +import ( + "bytes" + "testing" +) + +var TestKeys [][]byte = [][]byte{ + []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + []byte{15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, + []byte{8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7}, +} + +func TestKeyring_EmptyRing(t *testing.T) { + // Keyrings can be created with no encryption keys (disabled encryption) + keyring, err := NewKeyring(nil, nil) + if err != nil { + t.Fatalf("err: %s", err) + } + + keys := keyring.GetKeys() + if len(keys) != 0 { + t.Fatalf("Expected 0 keys but have %d", len(keys)) + } +} + +func TestKeyring_PrimaryOnly(t *testing.T) { + // Keyrings can be created using only a primary key + keyring, err := NewKeyring(nil, TestKeys[0]) + if err != nil { + t.Fatalf("err: %s", err) + } + + keys := keyring.GetKeys() + if len(keys) != 1 { + t.Fatalf("Expected 1 key but have %d", len(keys)) + } +} + +func TestKeyring_GetPrimaryKey(t *testing.T) { + keyring, err := 
NewKeyring(TestKeys, TestKeys[1]) + if err != nil { + t.Fatalf("err: %s", err) + } + + // GetPrimaryKey returns correct key + primaryKey := keyring.GetPrimaryKey() + if !bytes.Equal(primaryKey, TestKeys[1]) { + t.Fatalf("Unexpected primary key: %v", primaryKey) + } +} + +func TestKeyring_AddRemoveUse(t *testing.T) { + keyring, err := NewKeyring(nil, TestKeys[1]) + if err != nil { + t.Fatalf("err :%s", err) + } + + // Use non-existent key throws error + if err := keyring.UseKey(TestKeys[2]); err == nil { + t.Fatalf("Expected key not installed error") + } + + // Add key to ring + if err := keyring.AddKey(TestKeys[2]); err != nil { + t.Fatalf("err: %s", err) + } + + keys := keyring.GetKeys() + if !bytes.Equal(keys[0], TestKeys[1]) { + t.Fatalf("Unexpected primary key change") + } + + if len(keys) != 2 { + t.Fatalf("Expected 2 keys but have %d", len(keys)) + } + + // Use key that exists should succeed + if err := keyring.UseKey(TestKeys[2]); err != nil { + t.Fatalf("err: %s", err) + } + + primaryKey := keyring.GetPrimaryKey() + if !bytes.Equal(primaryKey, TestKeys[2]) { + t.Fatalf("Unexpected primary key: %v", primaryKey) + } + + // Removing primary key should fail + if err := keyring.RemoveKey(TestKeys[2]); err == nil { + t.Fatalf("Expected primary key removal error") + } + + // Removing non-primary key should succeed + if err := keyring.RemoveKey(TestKeys[1]); err != nil { + t.Fatalf("err: %s", err) + } + + keys = keyring.GetKeys() + if len(keys) != 1 { + t.Fatalf("Expected 1 key but have %d", len(keys)) + } +} + +func TestKeyRing_MultiKeyEncryptDecrypt(t *testing.T) { + plaintext := []byte("this is a plain text message") + extra := []byte("random data") + + keyring, err := NewKeyring(TestKeys, TestKeys[0]) + if err != nil { + t.Fatalf("err: %s", err) + } + + // First encrypt using the primary key and make sure we can decrypt + var buf bytes.Buffer + err = encryptPayload(1, TestKeys[0], plaintext, extra, &buf) + if err != nil { + t.Fatalf("err: %v", err) + } + + msg, 
err := decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) + if err != nil { + t.Fatalf("err: %v", err) + } + + if !bytes.Equal(msg, plaintext) { + t.Fatalf("bad: %v", msg) + } + + // Now encrypt with a secondary key and try decrypting again. + buf.Reset() + err = encryptPayload(1, TestKeys[2], plaintext, extra, &buf) + if err != nil { + t.Fatalf("err: %v", err) + } + + msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) + if err != nil { + t.Fatalf("err: %v", err) + } + + if !bytes.Equal(msg, plaintext) { + t.Fatalf("bad: %v", msg) + } + + // Remove a key from the ring, and then try decrypting again + if err := keyring.RemoveKey(TestKeys[2]); err != nil { + t.Fatalf("err: %s", err) + } + + msg, err = decryptPayload(keyring.GetKeys(), buf.Bytes(), extra) + if err == nil { + t.Fatalf("Expected no keys to decrypt message") + } +} diff --git a/vendor/github.com/hashicorp/memberlist/logging.go b/vendor/github.com/hashicorp/memberlist/logging.go new file mode 100644 index 000000000..f31acfb2f --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/logging.go @@ -0,0 +1,22 @@ +package memberlist + +import ( + "fmt" + "net" +) + +func LogAddress(addr net.Addr) string { + if addr == nil { + return "from=<unknown address>" + } + + return fmt.Sprintf("from=%s", addr.String()) +} + +func LogConn(conn net.Conn) string { + if conn == nil { + return LogAddress(nil) + } + + return LogAddress(conn.RemoteAddr()) +} diff --git a/vendor/github.com/hashicorp/memberlist/logging_test.go b/vendor/github.com/hashicorp/memberlist/logging_test.go new file mode 100644 index 000000000..cc04b8a91 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/logging_test.go @@ -0,0 +1,47 @@ +package memberlist + +import ( + "fmt" + "net" + "testing" +) + +func TestLogging_Address(t *testing.T) { + s := LogAddress(nil) + if s != "from=<unknown address>" { + t.Fatalf("bad: %s", s) + } + + addr, err := net.ResolveIPAddr("ip4", "127.0.0.1") + if err != nil { + t.Fatalf("err: %v", 
err) + } + + s = LogAddress(addr) + if s != "from=127.0.0.1" { + t.Fatalf("bad: %s", s) + } +} + +func TestLogging_Conn(t *testing.T) { + s := LogConn(nil) + if s != "from=<unknown address>" { + t.Fatalf("bad: %s", s) + } + + ln, err := net.Listen("tcp", ":0") + if err != nil { + t.Fatalf("err: %v", err) + } + + conn, err := net.Dial("tcp", ln.Addr().String()) + if err != nil { + t.Fatalf("err: %v", err) + } + defer conn.Close() + + s = LogConn(conn) + if s != fmt.Sprintf("from=%s", conn.RemoteAddr().String()) { + t.Fatalf("bad: %s", s) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/memberlist.go b/vendor/github.com/hashicorp/memberlist/memberlist.go new file mode 100644 index 000000000..e4b0d7347 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/memberlist.go @@ -0,0 +1,625 @@ +/* +memberlist is a library that manages cluster +membership and member failure detection using a gossip based protocol. + +The use cases for such a library are far-reaching: all distributed systems +require membership, and memberlist is a re-usable solution to managing +cluster membership and node failure detection. + +memberlist is eventually consistent but converges quickly on average. +The speed at which it converges can be heavily tuned via various knobs +on the protocol. Node failures are detected and network partitions are partially +tolerated by attempting to communicate to potentially dead nodes through +multiple routes. 
+*/ +package memberlist + +import ( + "fmt" + "log" + "net" + "os" + "strconv" + "strings" + "sync" + "time" + + "github.com/hashicorp/go-multierror" + sockaddr "github.com/hashicorp/go-sockaddr" + "github.com/miekg/dns" +) + +type Memberlist struct { + sequenceNum uint32 // Local sequence number + incarnation uint32 // Local incarnation number + numNodes uint32 // Number of known nodes (estimate) + + config *Config + shutdown bool + shutdownCh chan struct{} + leave bool + leaveBroadcast chan struct{} + + transport Transport + handoff chan msgHandoff + + nodeLock sync.RWMutex + nodes []*nodeState // Known nodes + nodeMap map[string]*nodeState // Maps Addr.String() -> NodeState + nodeTimers map[string]*suspicion // Maps Addr.String() -> suspicion timer + awareness *awareness + + tickerLock sync.Mutex + tickers []*time.Ticker + stopTick chan struct{} + probeIndex int + + ackLock sync.Mutex + ackHandlers map[uint32]*ackHandler + + broadcasts *TransmitLimitedQueue + + logger *log.Logger +} + +// newMemberlist creates the network listeners. +// Does not schedule execution of background maintenance. +func newMemberlist(conf *Config) (*Memberlist, error) { + if conf.ProtocolVersion < ProtocolVersionMin { + return nil, fmt.Errorf("Protocol version '%d' too low. Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } else if conf.ProtocolVersion > ProtocolVersionMax { + return nil, fmt.Errorf("Protocol version '%d' too high. 
Must be in range: [%d, %d]", + conf.ProtocolVersion, ProtocolVersionMin, ProtocolVersionMax) + } + + if len(conf.SecretKey) > 0 { + if conf.Keyring == nil { + keyring, err := NewKeyring(nil, conf.SecretKey) + if err != nil { + return nil, err + } + conf.Keyring = keyring + } else { + if err := conf.Keyring.AddKey(conf.SecretKey); err != nil { + return nil, err + } + if err := conf.Keyring.UseKey(conf.SecretKey); err != nil { + return nil, err + } + } + } + + if conf.LogOutput != nil && conf.Logger != nil { + return nil, fmt.Errorf("Cannot specify both LogOutput and Logger. Please choose a single log configuration setting.") + } + + logDest := conf.LogOutput + if logDest == nil { + logDest = os.Stderr + } + + logger := conf.Logger + if logger == nil { + logger = log.New(logDest, "", log.LstdFlags) + } + + // Set up a network transport by default if a custom one wasn't given + // by the config. + transport := conf.Transport + if transport == nil { + nc := &NetTransportConfig{ + BindAddrs: []string{conf.BindAddr}, + BindPort: conf.BindPort, + Logger: logger, + } + nt, err := NewNetTransport(nc) + if err != nil { + return nil, fmt.Errorf("Could not set up network transport: %v", err) + } + + if conf.BindPort == 0 { + port := nt.GetAutoBindPort() + conf.BindPort = port + logger.Printf("[DEBUG] Using dynamic bind port %d", port) + } + transport = nt + } + + m := &Memberlist{ + config: conf, + shutdownCh: make(chan struct{}), + leaveBroadcast: make(chan struct{}, 1), + transport: transport, + handoff: make(chan msgHandoff, conf.HandoffQueueDepth), + nodeMap: make(map[string]*nodeState), + nodeTimers: make(map[string]*suspicion), + awareness: newAwareness(conf.AwarenessMaxMultiplier), + ackHandlers: make(map[uint32]*ackHandler), + broadcasts: &TransmitLimitedQueue{RetransmitMult: conf.RetransmitMult}, + logger: logger, + } + m.broadcasts.NumNodes = func() int { + return m.estNumNodes() + } + go m.streamListen() + go m.packetListen() + go m.packetHandler() + return m, nil 
+} + +// Create will create a new Memberlist using the given configuration. +// This will not connect to any other node (see Join) yet, but will start +// all the listeners to allow other nodes to join this memberlist. +// After creating a Memberlist, the configuration given should not be +// modified by the user anymore. +func Create(conf *Config) (*Memberlist, error) { + m, err := newMemberlist(conf) + if err != nil { + return nil, err + } + if err := m.setAlive(); err != nil { + m.Shutdown() + return nil, err + } + m.schedule() + return m, nil +} + +// Join is used to take an existing Memberlist and attempt to join a cluster +// by contacting all the given hosts and performing a state sync. Initially, +// the Memberlist only contains our own state, so doing this will cause +// remote nodes to become aware of the existence of this node, effectively +// joining the cluster. +// +// This returns the number of hosts successfully contacted and an error if +// none could be reached. If an error is returned, the node did not successfully +// join the cluster. +func (m *Memberlist) Join(existing []string) (int, error) { + numSuccess := 0 + var errs error + for _, exist := range existing { + addrs, err := m.resolveAddr(exist) + if err != nil { + err = fmt.Errorf("Failed to resolve %s: %v", exist, err) + errs = multierror.Append(errs, err) + m.logger.Printf("[WARN] memberlist: %v", err) + continue + } + + for _, addr := range addrs { + hp := joinHostPort(addr.ip.String(), addr.port) + if err := m.pushPullNode(hp, true); err != nil { + err = fmt.Errorf("Failed to join %s: %v", addr.ip, err) + errs = multierror.Append(errs, err) + m.logger.Printf("[DEBUG] memberlist: %v", err) + continue + } + numSuccess++ + } + + } + if numSuccess > 0 { + errs = nil + } + return numSuccess, errs +} + +// ipPort holds information about a node we want to try to join. 
+type ipPort struct { + ip net.IP + port uint16 +} + +// tcpLookupIP is a helper to initiate a TCP-based DNS lookup for the given host. +// The built-in Go resolver will do a UDP lookup first, and will only use TCP if +// the response has the truncate bit set, which isn't common on DNS servers like +// Consul's. By doing the TCP lookup directly, we get the best chance for the +// largest list of hosts to join. Since joins are relatively rare events, it's ok +// to do this rather expensive operation. +func (m *Memberlist) tcpLookupIP(host string, defaultPort uint16) ([]ipPort, error) { + // Don't attempt any TCP lookups against non-fully qualified domain + // names, since those will likely come from the resolv.conf file. + if !strings.Contains(host, ".") { + return nil, nil + } + + // Make sure the domain name is terminated with a dot (we know there's + // at least one character at this point). + dn := host + if dn[len(dn)-1] != '.' { + dn = dn + "." + } + + // See if we can find a server to try. + cc, err := dns.ClientConfigFromFile(m.config.DNSConfigPath) + if err != nil { + return nil, err + } + if len(cc.Servers) > 0 { + // We support host:port in the DNS config, but need to add the + // default port if one is not supplied. + server := cc.Servers[0] + if !hasPort(server) { + server = net.JoinHostPort(server, cc.Port) + } + + // Do the lookup. + c := new(dns.Client) + c.Net = "tcp" + msg := new(dns.Msg) + msg.SetQuestion(dn, dns.TypeANY) + in, _, err := c.Exchange(msg, server) + if err != nil { + return nil, err + } + + // Handle any IPs we get back that we can attempt to join. 
+ var ips []ipPort + for _, r := range in.Answer { + switch rr := r.(type) { + case (*dns.A): + ips = append(ips, ipPort{rr.A, defaultPort}) + case (*dns.AAAA): + ips = append(ips, ipPort{rr.AAAA, defaultPort}) + case (*dns.CNAME): + m.logger.Printf("[DEBUG] memberlist: Ignoring CNAME RR in TCP-first answer for '%s'", host) + } + } + return ips, nil + } + + return nil, nil +} + +// resolveAddr is used to resolve the address into an address, +// port, and error. If no port is given, use the default +func (m *Memberlist) resolveAddr(hostStr string) ([]ipPort, error) { + // Normalize the incoming string to host:port so we can apply Go's + // parser to it. + port := uint16(0) + if !hasPort(hostStr) { + hostStr += ":" + strconv.Itoa(m.config.BindPort) + } + host, sport, err := net.SplitHostPort(hostStr) + if err != nil { + return nil, err + } + + // This will capture the supplied port, or the default one added above. + lport, err := strconv.ParseUint(sport, 10, 16) + if err != nil { + return nil, err + } + port = uint16(lport) + + // If it looks like an IP address we are done. The SplitHostPort() above + // will make sure the host part is in good shape for parsing, even for + // IPv6 addresses. + if ip := net.ParseIP(host); ip != nil { + return []ipPort{ipPort{ip, port}}, nil + } + + // First try TCP so we have the best chance for the largest list of + // hosts to join. If this fails it's not fatal since this isn't a standard + // way to query DNS, and we have a fallback below. + ips, err := m.tcpLookupIP(host, port) + if err != nil { + m.logger.Printf("[DEBUG] memberlist: TCP-first lookup failed for '%s', falling back to UDP: %s", hostStr, err) + } + if len(ips) > 0 { + return ips, nil + } + + // If TCP didn't yield anything then use the normal Go resolver which + // will try UDP, then might possibly try TCP again if the UDP response + // indicates it was truncated. 
+ ans, err := net.LookupIP(host) + if err != nil { + return nil, err + } + ips = make([]ipPort, 0, len(ans)) + for _, ip := range ans { + ips = append(ips, ipPort{ip, port}) + } + return ips, nil +} + +// setAlive is used to mark this node as being alive. This is the same +// as if we received an alive notification our own network channel for +// ourself. +func (m *Memberlist) setAlive() error { + // Get the final advertise address from the transport, which may need + // to see which address we bound to. + addr, port, err := m.transport.FinalAdvertiseAddr( + m.config.AdvertiseAddr, m.config.AdvertisePort) + if err != nil { + return fmt.Errorf("Failed to get final advertise address: %v", err) + } + + // Check if this is a public address without encryption + ipAddr, err := sockaddr.NewIPAddr(addr.String()) + if err != nil { + return fmt.Errorf("Failed to parse interface addresses: %v", err) + } + ifAddrs := []sockaddr.IfAddr{ + sockaddr.IfAddr{ + SockAddr: ipAddr, + }, + } + _, publicIfs, err := sockaddr.IfByRFC("6890", ifAddrs) + if len(publicIfs) > 0 && !m.config.EncryptionEnabled() { + m.logger.Printf("[WARN] memberlist: Binding to public address without encryption!") + } + + // Set any metadata from the delegate. 
+ var meta []byte + if m.config.Delegate != nil { + meta = m.config.Delegate.NodeMeta(MetaMaxSize) + if len(meta) > MetaMaxSize { + panic("Node meta data provided is longer than the limit") + } + } + + a := alive{ + Incarnation: m.nextIncarnation(), + Node: m.config.Name, + Addr: addr, + Port: uint16(port), + Meta: meta, + Vsn: []uint8{ + ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion, + m.config.DelegateProtocolMin, m.config.DelegateProtocolMax, + m.config.DelegateProtocolVersion, + }, + } + m.aliveNode(&a, nil, true) + return nil +} + +// LocalNode is used to return the local Node +func (m *Memberlist) LocalNode() *Node { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + state := m.nodeMap[m.config.Name] + return &state.Node +} + +// UpdateNode is used to trigger re-advertising the local node. This is +// primarily used with a Delegate to support dynamic updates to the local +// meta data. This will block until the update message is successfully +// broadcasted to a member of the cluster, if any exist or until a specified +// timeout is reached. 
// UpdateNode re-broadcasts the local node's alive message, refreshing the
// delegate-provided metadata, then blocks until the broadcast is handed off
// (or the timeout elapses). A timeout of 0 blocks indefinitely if other
// members are alive, since the nil timeout channel never fires.
func (m *Memberlist) UpdateNode(timeout time.Duration) error {
	// Get the node meta data
	var meta []byte
	if m.config.Delegate != nil {
		meta = m.config.Delegate.NodeMeta(MetaMaxSize)
		if len(meta) > MetaMaxSize {
			panic("Node meta data provided is longer than the limit")
		}
	}

	// Get the existing node
	// NOTE(review): no ok-check on the map lookup; if the local node is
	// somehow absent, state is nil and state.Addr below panics — TODO confirm
	// setAlive always runs before UpdateNode.
	m.nodeLock.RLock()
	state := m.nodeMap[m.config.Name]
	m.nodeLock.RUnlock()

	// Format a new alive message
	a := alive{
		Incarnation: m.nextIncarnation(),
		Node:        m.config.Name,
		Addr:        state.Addr,
		Port:        state.Port,
		Meta:        meta,
		Vsn: []uint8{
			ProtocolVersionMin, ProtocolVersionMax, m.config.ProtocolVersion,
			m.config.DelegateProtocolMin, m.config.DelegateProtocolMax,
			m.config.DelegateProtocolVersion,
		},
	}
	notifyCh := make(chan struct{})
	m.aliveNode(&a, notifyCh, true)

	// Wait for the broadcast or a timeout
	if m.anyAlive() {
		var timeoutCh <-chan time.Time
		if timeout > 0 {
			timeoutCh = time.After(timeout)
		}
		select {
		case <-notifyCh:
		case <-timeoutCh:
			return fmt.Errorf("timeout waiting for update broadcast")
		}
	}
	return nil
}

// SendTo is deprecated in favor of SendBestEffort, which requires a node to
// target.
//
// Deprecated: Use SendBestEffort instead.
func (m *Memberlist) SendTo(to net.Addr, msg []byte) error {
	// Encode as a user message: prefix the payload with the userMsg type byte.
	buf := make([]byte, 1, len(msg)+1)
	buf[0] = byte(userMsg)
	buf = append(buf, msg...)

	// Send the message
	return m.rawSendMsgPacket(to.String(), nil, buf)
}

// SendToUDP is deprecated in favor of SendBestEffort.
//
// Deprecated: Use SendBestEffort instead.
func (m *Memberlist) SendToUDP(to *Node, msg []byte) error {
	return m.SendBestEffort(to, msg)
}

// SendToTCP is deprecated in favor of SendReliable.
//
// Deprecated: Use SendReliable instead.
func (m *Memberlist) SendToTCP(to *Node, msg []byte) error {
	return m.SendReliable(to, msg)
}

// SendBestEffort uses the unreliable packet-oriented interface of the transport
// to target a user message at the given node (this does not use the gossip
// mechanism). The maximum size of the message depends on the configured
// UDPBufferSize for this memberlist instance.
func (m *Memberlist) SendBestEffort(to *Node, msg []byte) error {
	// Encode as a user message: prefix the payload with the userMsg type byte.
	buf := make([]byte, 1, len(msg)+1)
	buf[0] = byte(userMsg)
	buf = append(buf, msg...)

	// Send the message
	return m.rawSendMsgPacket(to.Address(), to, buf)
}

// SendReliable uses the reliable stream-oriented interface of the transport to
// target a user message at the given node (this does not use the gossip
// mechanism). Delivery is guaranteed if no error is returned, and there is no
// limit on the size of the message.
func (m *Memberlist) SendReliable(to *Node, msg []byte) error {
	return m.sendUserMsg(to.Address(), msg)
}

// Members returns a list of all known live nodes. The node structures
// returned must not be modified. If you wish to modify a Node, make a
// copy first.
func (m *Memberlist) Members() []*Node {
	m.nodeLock.RLock()
	defer m.nodeLock.RUnlock()

	// Dead nodes are filtered out; suspect nodes are still included.
	nodes := make([]*Node, 0, len(m.nodes))
	for _, n := range m.nodes {
		if n.State != stateDead {
			nodes = append(nodes, &n.Node)
		}
	}

	return nodes
}

// NumMembers returns the number of alive nodes currently known. Between
// the time of calling this and calling Members, the number of alive nodes
// may have changed, so this shouldn't be used to determine how many
// members will be returned by Members.
func (m *Memberlist) NumMembers() (alive int) {
	m.nodeLock.RLock()
	defer m.nodeLock.RUnlock()

	// Counts every non-dead node (same filter as Members), including suspects.
	for _, n := range m.nodes {
		if n.State != stateDead {
			alive++
		}
	}

	return
}

// Leave will broadcast a leave message but will not shutdown the background
// listeners, meaning the node will continue participating in gossip and state
// updates.
//
// This will block until the leave message is successfully broadcasted to
// a member of the cluster, if any exist or until a specified timeout
// is reached.
//
// This method is safe to call multiple times, but must not be called
// after the cluster is already shut down.
func (m *Memberlist) Leave(timeout time.Duration) error {
	m.nodeLock.Lock()
	// We can't defer m.nodeLock.Unlock() because m.deadNode will also try to
	// acquire a lock so we need to Unlock before that.

	if m.shutdown {
		m.nodeLock.Unlock()
		panic("leave after shutdown")
	}

	if !m.leave {
		// First Leave call: mark ourselves as leaving, then announce our
		// own death so the cluster gossips us out.
		m.leave = true

		state, ok := m.nodeMap[m.config.Name]
		m.nodeLock.Unlock()
		if !ok {
			m.logger.Printf("[WARN] memberlist: Leave but we're not in the node map.")
			return nil
		}

		d := dead{
			Incarnation: state.Incarnation,
			Node:        state.Name,
		}
		m.deadNode(&d)

		// Block until the broadcast goes out. With timeout <= 0 the
		// timeout channel stays nil, so this waits indefinitely.
		if m.anyAlive() {
			var timeoutCh <-chan time.Time
			if timeout > 0 {
				timeoutCh = time.After(timeout)
			}
			select {
			case <-m.leaveBroadcast:
			case <-timeoutCh:
				return fmt.Errorf("timeout waiting for leave broadcast")
			}
		}
	} else {
		// Subsequent calls are no-ops; just release the lock.
		m.nodeLock.Unlock()
	}

	return nil
}

// Check for any other alive node.
func (m *Memberlist) anyAlive() bool {
	m.nodeLock.RLock()
	defer m.nodeLock.RUnlock()
	// A non-dead node other than ourselves counts as "alive" here.
	for _, n := range m.nodes {
		if n.State != stateDead && n.Name != m.config.Name {
			return true
		}
	}
	return false
}

// GetHealthScore gives this instance's idea of how well it is meeting the soft
// real-time requirements of the protocol. Lower numbers are better, and zero
// means "totally healthy".
func (m *Memberlist) GetHealthScore() int {
	return m.awareness.GetHealthScore()
}

// ProtocolVersion returns the protocol version currently in use by
// this memberlist.
func (m *Memberlist) ProtocolVersion() uint8 {
	// NOTE: This method exists so that in the future we can control
	// any locking if necessary, if we change the protocol version at
	// runtime, etc.
+ return m.config.ProtocolVersion +} + +// Shutdown will stop any background maintanence of network activity +// for this memberlist, causing it to appear "dead". A leave message +// will not be broadcasted prior, so the cluster being left will have +// to detect this node's shutdown using probing. If you wish to more +// gracefully exit the cluster, call Leave prior to shutting down. +// +// This method is safe to call multiple times. +func (m *Memberlist) Shutdown() error { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + if m.shutdown { + return nil + } + + // Shut down the transport first, which should block until it's + // completely torn down. If we kill the memberlist-side handlers + // those I/O handlers might get stuck. + m.transport.Shutdown() + + // Now tear down everything else. + m.shutdown = true + close(m.shutdownCh) + m.deschedule() + return nil +} diff --git a/vendor/github.com/hashicorp/memberlist/memberlist_test.go b/vendor/github.com/hashicorp/memberlist/memberlist_test.go new file mode 100644 index 000000000..ff03ab3e4 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/memberlist_test.go @@ -0,0 +1,1416 @@ +package memberlist + +import ( + "bytes" + "fmt" + "io/ioutil" + "log" + "net" + "os" + "reflect" + "strings" + "sync" + "testing" + "time" + + "github.com/miekg/dns" +) + +var bindLock sync.Mutex +var bindNum byte = 10 + +func getBindAddr() net.IP { + bindLock.Lock() + defer bindLock.Unlock() + + result := net.IPv4(127, 0, 0, bindNum) + bindNum++ + if bindNum > 255 { + bindNum = 10 + } + + return result +} + +func testConfig() *Config { + config := DefaultLANConfig() + config.BindAddr = getBindAddr().String() + config.Name = config.BindAddr + return config +} + +func yield() { + time.Sleep(5 * time.Millisecond) +} + +type MockDelegate struct { + meta []byte + msgs [][]byte + broadcasts [][]byte + state []byte + remoteState []byte +} + +func (m *MockDelegate) NodeMeta(limit int) []byte { + return m.meta +} + +func (m 
*MockDelegate) NotifyMsg(msg []byte) { + cp := make([]byte, len(msg)) + copy(cp, msg) + m.msgs = append(m.msgs, cp) +} + +func (m *MockDelegate) GetBroadcasts(overhead, limit int) [][]byte { + b := m.broadcasts + m.broadcasts = nil + return b +} + +func (m *MockDelegate) LocalState(join bool) []byte { + return m.state +} + +func (m *MockDelegate) MergeRemoteState(s []byte, join bool) { + m.remoteState = s +} + +// Returns a new Memberlist on an open port by trying a range of port numbers +// until something sticks. +func NewMemberlistOnOpenPort(c *Config) (*Memberlist, error) { + c.BindPort = 0 + return newMemberlist(c) +} + +func GetMemberlistDelegate(t *testing.T) (*Memberlist, *MockDelegate) { + d := &MockDelegate{} + + c := testConfig() + c.Delegate = d + + m, err := NewMemberlistOnOpenPort(c) + if err != nil { + t.Fatalf("failed to start: %v", err) + return nil, nil + } + + return m, d +} + +func GetMemberlist(t *testing.T) *Memberlist { + c := testConfig() + + m, err := NewMemberlistOnOpenPort(c) + if err != nil { + t.Fatalf("failed to start: %v", err) + return nil + } + + return m +} + +func TestDefaultLANConfig_protocolVersion(t *testing.T) { + c := DefaultLANConfig() + if c.ProtocolVersion != ProtocolVersion2Compatible { + t.Fatalf("should be max: %d", c.ProtocolVersion) + } +} + +func TestCreate_protocolVersion(t *testing.T) { + cases := []struct { + version uint8 + err bool + }{ + {ProtocolVersionMin, false}, + {ProtocolVersionMax, false}, + // TODO(mitchellh): uncommon when we're over 0 + //{ProtocolVersionMin - 1, true}, + {ProtocolVersionMax + 1, true}, + {ProtocolVersionMax - 1, false}, + } + + for _, tc := range cases { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + c.ProtocolVersion = tc.version + m, err := Create(c) + if tc.err && err == nil { + t.Errorf("Should've failed with version: %d", tc.version) + } else if !tc.err && err != nil { + t.Errorf("Version '%d' error: %s", tc.version, err) + } + + if err == nil { + m.Shutdown() 
+ } + } +} + +func TestCreate_secretKey(t *testing.T) { + cases := []struct { + key []byte + err bool + }{ + {make([]byte, 0), false}, + {[]byte("abc"), true}, + {make([]byte, 16), false}, + {make([]byte, 38), true}, + } + + for _, tc := range cases { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + c.SecretKey = tc.key + m, err := Create(c) + if tc.err && err == nil { + t.Errorf("Should've failed with key: %#v", tc.key) + } else if !tc.err && err != nil { + t.Errorf("Key '%#v' error: %s", tc.key, err) + } + + if err == nil { + m.Shutdown() + } + } +} + +func TestCreate_secretKeyEmpty(t *testing.T) { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + c.SecretKey = make([]byte, 0) + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + if m.config.EncryptionEnabled() { + t.Fatalf("Expected encryption to be disabled") + } +} + +func TestCreate_keyringOnly(t *testing.T) { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + keyring, err := NewKeyring(nil, make([]byte, 16)) + if err != nil { + t.Fatalf("err: %s", err) + } + c.Keyring = keyring + + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + if !m.config.EncryptionEnabled() { + t.Fatalf("Expected encryption to be enabled") + } +} + +func TestCreate_keyringAndSecretKey(t *testing.T) { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + keyring, err := NewKeyring(nil, make([]byte, 16)) + if err != nil { + t.Fatalf("err: %s", err) + } + c.Keyring = keyring + c.SecretKey = []byte{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1} + + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + if !m.config.EncryptionEnabled() { + t.Fatalf("Expected encryption to be enabled") + } + + ringKeys := c.Keyring.GetKeys() + if !bytes.Equal(c.SecretKey, ringKeys[0]) { + t.Fatalf("Unexpected primary key %v", ringKeys[0]) + } +} + +func 
TestCreate_invalidLoggerSettings(t *testing.T) { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + c.Logger = log.New(ioutil.Discard, "", log.LstdFlags) + c.LogOutput = ioutil.Discard + + _, err := Create(c) + if err == nil { + t.Fatal("Memberlist should not allow both LogOutput and Logger to be set, but it did not raise an error") + } +} + +func TestCreate(t *testing.T) { + c := testConfig() + c.ProtocolVersion = ProtocolVersionMin + c.DelegateProtocolVersion = 13 + c.DelegateProtocolMin = 12 + c.DelegateProtocolMax = 24 + + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + yield() + + members := m.Members() + if len(members) != 1 { + t.Fatalf("bad number of members") + } + + if members[0].PMin != ProtocolVersionMin { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].PMax != ProtocolVersionMax { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].PCur != c.ProtocolVersion { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].DMin != c.DelegateProtocolMin { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].DMax != c.DelegateProtocolMax { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].DCur != c.DelegateProtocolVersion { + t.Fatalf("bad: %#v", members[0]) + } +} + +func TestMemberList_CreateShutdown(t *testing.T) { + m := GetMemberlist(t) + m.schedule() + if err := m.Shutdown(); err != nil { + t.Fatalf("failed to shutdown %v", err) + } +} + +func TestMemberList_ResolveAddr(t *testing.T) { + m := GetMemberlist(t) + if _, err := m.resolveAddr("localhost"); err != nil { + t.Fatalf("Could not resolve localhost: %s", err) + } + if _, err := m.resolveAddr("[::1]:80"); err != nil { + t.Fatalf("Could not understand ipv6 pair: %s", err) + } + if _, err := m.resolveAddr("[::1]"); err != nil { + t.Fatalf("Could not understand ipv6 non-pair") + } + if _, err := m.resolveAddr(":80"); err == nil { + t.Fatalf("Understood hostless port") + } + if _, err := m.resolveAddr("localhost:80"); err 
!= nil { + t.Fatalf("Could not understand hostname port combo: %s", err) + } + if _, err := m.resolveAddr("localhost:80000"); err == nil { + t.Fatalf("Understood too high port") + } + if _, err := m.resolveAddr("127.0.0.1:80"); err != nil { + t.Fatalf("Could not understand hostname port combo: %s", err) + } + if _, err := m.resolveAddr("[2001:db8:a0b:12f0::1]:80"); err != nil { + t.Fatalf("Could not understand hostname port combo: %s", err) + } +} + +type dnsHandler struct { + t *testing.T +} + +func (h dnsHandler) ServeDNS(w dns.ResponseWriter, r *dns.Msg) { + if len(r.Question) != 1 { + h.t.Fatalf("bad: %#v", r.Question) + } + + name := "join.service.consul." + question := r.Question[0] + if question.Name != name || question.Qtype != dns.TypeANY { + h.t.Fatalf("bad: %#v", question) + } + + m := new(dns.Msg) + m.SetReply(r) + m.Authoritative = true + m.RecursionAvailable = false + m.Answer = append(m.Answer, &dns.A{ + Hdr: dns.RR_Header{ + Name: name, + Rrtype: dns.TypeA, + Class: dns.ClassINET}, + A: net.ParseIP("127.0.0.1"), + }) + m.Answer = append(m.Answer, &dns.AAAA{ + Hdr: dns.RR_Header{ + Name: name, + Rrtype: dns.TypeAAAA, + Class: dns.ClassINET}, + AAAA: net.ParseIP("2001:db8:a0b:12f0::1"), + }) + if err := w.WriteMsg(m); err != nil { + h.t.Fatalf("err: %v", err) + } +} + +func TestMemberList_ResolveAddr_TCP_First(t *testing.T) { + bind := "127.0.0.1:8600" + + var wg sync.WaitGroup + wg.Add(1) + server := &dns.Server{ + Addr: bind, + Handler: dnsHandler{t}, + Net: "tcp", + NotifyStartedFunc: wg.Done, + } + defer server.Shutdown() + + go func() { + if err := server.ListenAndServe(); err != nil && !strings.Contains(err.Error(), "use of closed network connection") { + t.Fatalf("err: %v", err) + } + }() + wg.Wait() + + tmpFile, err := ioutil.TempFile("", "") + if err != nil { + t.Fatalf("err: %v", err) + } + defer os.Remove(tmpFile.Name()) + + content := []byte(fmt.Sprintf("nameserver %s", bind)) + if _, err := tmpFile.Write(content); err != nil { + 
t.Fatalf("err: %v", err) + } + if err := tmpFile.Close(); err != nil { + t.Fatalf("err: %v", err) + } + + m := GetMemberlist(t) + m.config.DNSConfigPath = tmpFile.Name() + m.setAlive() + m.schedule() + defer m.Shutdown() + + // Try with and without the trailing dot. + hosts := []string{ + "join.service.consul.", + "join.service.consul", + } + for _, host := range hosts { + ips, err := m.resolveAddr(host) + if err != nil { + t.Fatalf("err: %v", err) + } + port := uint16(m.config.BindPort) + expected := []ipPort{ + ipPort{net.ParseIP("127.0.0.1"), port}, + ipPort{net.ParseIP("2001:db8:a0b:12f0::1"), port}, + } + if !reflect.DeepEqual(ips, expected) { + t.Fatalf("bad: %#v", ips) + } + } +} + +func TestMemberList_Members(t *testing.T) { + n1 := &Node{Name: "test"} + n2 := &Node{Name: "test2"} + n3 := &Node{Name: "test3"} + + m := &Memberlist{} + nodes := []*nodeState{ + &nodeState{Node: *n1, State: stateAlive}, + &nodeState{Node: *n2, State: stateDead}, + &nodeState{Node: *n3, State: stateSuspect}, + } + m.nodes = nodes + + members := m.Members() + if !reflect.DeepEqual(members, []*Node{n1, n3}) { + t.Fatalf("bad members") + } +} + +func TestMemberlist_Join(t *testing.T) { + m1 := GetMemberlist(t) + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 2 { + t.Fatalf("should have 2 nodes! %v", m2.Members()) + } + if m2.estNumNodes() != 2 { + t.Fatalf("should have 2 nodes! 
%v", m2.Members()) + } +} + +type CustomMergeDelegate struct { + invoked bool +} + +func (c *CustomMergeDelegate) NotifyMerge(nodes []*Node) error { + log.Printf("Cancel merge") + c.invoked = true + return fmt.Errorf("Custom merge canceled") +} + +func TestMemberlist_Join_Cancel(t *testing.T) { + m1 := GetMemberlist(t) + merge1 := &CustomMergeDelegate{} + m1.config.Merge = merge1 + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + merge2 := &CustomMergeDelegate{} + m2.config.Merge = merge2 + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 0 { + t.Fatalf("unexpected 0: %d", num) + } + if !strings.Contains(err.Error(), "Custom merge canceled") { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 1 { + t.Fatalf("should have 1 nodes! %v", m2.Members()) + } + if len(m1.Members()) != 1 { + t.Fatalf("should have 1 nodes! 
%v", m1.Members()) + } + + // Check delegate invocation + if !merge1.invoked { + t.Fatalf("should invoke delegate") + } + if !merge2.invoked { + t.Fatalf("should invoke delegate") + } +} + +type CustomAliveDelegate struct { + Ignore string + count int +} + +func (c *CustomAliveDelegate) NotifyAlive(peer *Node) error { + c.count++ + if peer.Name == c.Ignore { + return nil + } + log.Printf("Cancel alive") + return fmt.Errorf("Custom alive canceled") +} + +func TestMemberlist_Join_Cancel_Passive(t *testing.T) { + m1 := GetMemberlist(t) + alive1 := &CustomAliveDelegate{ + Ignore: m1.config.Name, + } + m1.config.Alive = alive1 + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + alive2 := &CustomAliveDelegate{ + Ignore: c.Name, + } + m2.config.Alive = alive2 + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 1 { + t.Fatalf("should have 1 nodes! %v", m2.Members()) + } + if len(m1.Members()) != 1 { + t.Fatalf("should have 1 nodes! 
%v", m1.Members()) + } + + // Check delegate invocation + if alive1.count == 0 { + t.Fatalf("should invoke delegate: %d", alive1.count) + } + if alive2.count == 0 { + t.Fatalf("should invoke delegate: %d", alive2.count) + } +} + +func TestMemberlist_Join_protocolVersions(t *testing.T) { + c1 := testConfig() + c2 := testConfig() + c3 := testConfig() + c3.ProtocolVersion = ProtocolVersionMax + + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m1.Shutdown() + + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m2.Shutdown() + + m3, err := Create(c3) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m3.Shutdown() + + _, err = m1.Join([]string{c2.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } + + yield() + + _, err = m1.Join([]string{c3.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } +} + +func TestMemberlist_Leave(t *testing.T) { + m1 := GetMemberlist(t) + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + c.GossipInterval = time.Millisecond + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 2 { + t.Fatalf("should have 2 nodes! %v", m2.Members()) + } + if len(m1.Members()) != 2 { + t.Fatalf("should have 2 nodes! 
%v", m2.Members()) + } + + // Leave + m1.Leave(time.Second) + + // Wait for leave + time.Sleep(10 * time.Millisecond) + + // m1 should think dead + if len(m1.Members()) != 1 { + t.Fatalf("should have 1 node") + } + + if len(m2.Members()) != 1 { + t.Fatalf("should have 1 node") + } +} + +func TestMemberlist_JoinShutdown(t *testing.T) { + m1 := GetMemberlist(t) + m1.setAlive() + m1.schedule() + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + c.ProbeInterval = time.Millisecond + c.ProbeTimeout = 100 * time.Microsecond + c.SuspicionMaxTimeoutMult = 1 + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 2 { + t.Fatalf("should have 2 nodes! %v", m2.Members()) + } + + m1.Shutdown() + + time.Sleep(10 * time.Millisecond) + + if len(m2.Members()) != 1 { + t.Fatalf("should have 1 nodes! 
%v", m2.Members()) + } +} + +func TestMemberlist_delegateMeta(t *testing.T) { + c1 := testConfig() + c2 := testConfig() + c1.Delegate = &MockDelegate{meta: []byte("web")} + c2.Delegate = &MockDelegate{meta: []byte("lb")} + + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m1.Shutdown() + + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m2.Shutdown() + + _, err = m1.Join([]string{c2.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } + + yield() + + var roles map[string]string + + // Check the roles of members of m1 + m1m := m1.Members() + if len(m1m) != 2 { + t.Fatalf("bad: %#v", m1m) + } + + roles = make(map[string]string) + for _, m := range m1m { + roles[m.Name] = string(m.Meta) + } + + if r := roles[c1.Name]; r != "web" { + t.Fatalf("bad role for %s: %s", c1.Name, r) + } + + if r := roles[c2.Name]; r != "lb" { + t.Fatalf("bad role for %s: %s", c2.Name, r) + } + + // Check the roles of members of m2 + m2m := m2.Members() + if len(m2m) != 2 { + t.Fatalf("bad: %#v", m2m) + } + + roles = make(map[string]string) + for _, m := range m2m { + roles[m.Name] = string(m.Meta) + } + + if r := roles[c1.Name]; r != "web" { + t.Fatalf("bad role for %s: %s", c1.Name, r) + } + + if r := roles[c2.Name]; r != "lb" { + t.Fatalf("bad role for %s: %s", c2.Name, r) + } +} + +func TestMemberlist_delegateMeta_Update(t *testing.T) { + c1 := testConfig() + c2 := testConfig() + mock1 := &MockDelegate{meta: []byte("web")} + mock2 := &MockDelegate{meta: []byte("lb")} + c1.Delegate = mock1 + c2.Delegate = mock2 + + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m1.Shutdown() + + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m2.Shutdown() + + _, err = m1.Join([]string{c2.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } + + yield() + + // Update the meta data roles + mock1.meta = []byte("api") + mock2.meta = []byte("db") + + m1.UpdateNode(0) + 
m2.UpdateNode(0) + yield() + + // Check the updates have propagated + var roles map[string]string + + // Check the roles of members of m1 + m1m := m1.Members() + if len(m1m) != 2 { + t.Fatalf("bad: %#v", m1m) + } + + roles = make(map[string]string) + for _, m := range m1m { + roles[m.Name] = string(m.Meta) + } + + if r := roles[c1.Name]; r != "api" { + t.Fatalf("bad role for %s: %s", c1.Name, r) + } + + if r := roles[c2.Name]; r != "db" { + t.Fatalf("bad role for %s: %s", c2.Name, r) + } + + // Check the roles of members of m2 + m2m := m2.Members() + if len(m2m) != 2 { + t.Fatalf("bad: %#v", m2m) + } + + roles = make(map[string]string) + for _, m := range m2m { + roles[m.Name] = string(m.Meta) + } + + if r := roles[c1.Name]; r != "api" { + t.Fatalf("bad role for %s: %s", c1.Name, r) + } + + if r := roles[c2.Name]; r != "db" { + t.Fatalf("bad role for %s: %s", c2.Name, r) + } +} + +func TestMemberlist_UserData(t *testing.T) { + m1, d1 := GetMemberlistDelegate(t) + d1.state = []byte("something") + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second delegate with things to send + d2 := &MockDelegate{} + d2.broadcasts = [][]byte{ + []byte("test"), + []byte("foobar"), + } + d2.state = []byte("my state") + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + c.GossipInterval = time.Millisecond + c.PushPullInterval = time.Millisecond + c.Delegate = d2 + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + // Check the hosts + if m2.NumMembers() != 2 { + t.Fatalf("should have 2 nodes! 
%v", m2.Members()) + } + + // Wait for a little while + time.Sleep(3 * time.Millisecond) + + // Ensure we got the messages + if len(d1.msgs) != 2 { + t.Fatalf("should have 2 messages!") + } + if !reflect.DeepEqual(d1.msgs[0], []byte("test")) { + t.Fatalf("bad msg %v", d1.msgs[0]) + } + if !reflect.DeepEqual(d1.msgs[1], []byte("foobar")) { + t.Fatalf("bad msg %v", d1.msgs[1]) + } + + // Check the push/pull state + if !reflect.DeepEqual(d1.remoteState, []byte("my state")) { + t.Fatalf("bad state %s", d1.remoteState) + } + if !reflect.DeepEqual(d2.remoteState, []byte("something")) { + t.Fatalf("bad state %s", d2.remoteState) + } +} + +func TestMemberlist_SendTo(t *testing.T) { + m1, d1 := GetMemberlistDelegate(t) + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second delegate with things to send + d2 := &MockDelegate{} + + // Create a second node + c := DefaultLANConfig() + addr1 := getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + c.GossipInterval = time.Millisecond + c.PushPullInterval = time.Millisecond + c.Delegate = d2 + + m2, err := Create(c) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if m2.NumMembers() != 2 { + t.Fatalf("should have 2 nodes! 
%v", m2.Members()) + } + + // Try to do a direct send + m2Addr := &net.UDPAddr{IP: addr1, + Port: c.BindPort} + if err := m1.SendTo(m2Addr, []byte("ping")); err != nil { + t.Fatalf("err: %v", err) + } + + m1Addr := &net.UDPAddr{IP: net.ParseIP(m1.config.BindAddr), + Port: m1.config.BindPort} + if err := m2.SendTo(m1Addr, []byte("pong")); err != nil { + t.Fatalf("err: %v", err) + } + + // Wait for a little while + time.Sleep(3 * time.Millisecond) + + // Ensure we got the messages + if len(d1.msgs) != 1 { + t.Fatalf("should have 1 messages!") + } + if !reflect.DeepEqual(d1.msgs[0], []byte("pong")) { + t.Fatalf("bad msg %v", d1.msgs[0]) + } + + if len(d2.msgs) != 1 { + t.Fatalf("should have 1 messages!") + } + if !reflect.DeepEqual(d2.msgs[0], []byte("ping")) { + t.Fatalf("bad msg %v", d2.msgs[0]) + } +} + +func TestMemberlistProtocolVersion(t *testing.T) { + c := DefaultLANConfig() + c.BindAddr = getBindAddr().String() + c.ProtocolVersion = ProtocolVersionMax + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + result := m.ProtocolVersion() + if result != ProtocolVersionMax { + t.Fatalf("bad: %d", result) + } +} + +func TestMemberlist_Join_DeadNode(t *testing.T) { + m1 := GetMemberlist(t) + m1.config.TCPTimeout = 50 * time.Millisecond + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second "node", which is just a TCP listener that + // does not ever respond. 
This is to test our deadliens + addr1 := getBindAddr() + list, err := net.Listen("tcp", fmt.Sprintf("%s:%d", addr1.String(), m1.config.BindPort)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer list.Close() + + // Ensure we don't hang forever + timer := time.AfterFunc(100*time.Millisecond, func() { + panic("should have timed out by now") + }) + defer timer.Stop() + + num, err := m1.Join([]string{addr1.String()}) + if num != 0 { + t.Fatalf("unexpected 0: %d", num) + } + if err == nil { + t.Fatal("expect err") + } +} + +// Tests that nodes running different versions of the protocol can successfully +// discover each other and add themselves to their respective member lists. +func TestMemberlist_Join_Prototocol_Compatibility(t *testing.T) { + testProtocolVersionPair := func(t *testing.T, pv1 uint8, pv2 uint8) { + c1 := testConfig() + c1.ProtocolVersion = pv1 + m1, err := NewMemberlistOnOpenPort(c1) + if err != nil { + t.Fatalf("failed to start: %v", err) + } + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + c2 := DefaultLANConfig() + addr1 := getBindAddr() + c2.Name = addr1.String() + c2.BindAddr = addr1.String() + c2.BindPort = m1.config.BindPort + c2.ProtocolVersion = pv2 + + m2, err := Create(c2) + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{m1.config.BindAddr}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 2 { + t.Fatalf("should have 2 nodes! %v", m2.Members()) + } + + // Check the hosts + if len(m1.Members()) != 2 { + t.Fatalf("should have 2 nodes! 
%v", m1.Members()) + } + } + + testProtocolVersionPair(t, 2, 1) + testProtocolVersionPair(t, 2, 3) + testProtocolVersionPair(t, 3, 2) + testProtocolVersionPair(t, 3, 1) +} + +func TestMemberlist_Join_IPv6(t *testing.T) { + // Since this binds to all interfaces we need to exclude other tests + // from grabbing an interface. + bindLock.Lock() + defer bindLock.Unlock() + + c1 := DefaultLANConfig() + c1.Name = "A" + c1.BindAddr = "[::1]" + var m1 *Memberlist + var err error + for i := 0; i < 100; i++ { + c1.BindPort = 23456 + i + m1, err = Create(c1) + if err == nil { + break + } + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m1.Shutdown() + + // Create a second node + c2 := DefaultLANConfig() + c2.Name = "B" + c2.BindAddr = "[::1]" + var m2 *Memberlist + for i := 0; i < 100; i++ { + c2.BindPort = c1.BindPort + 1 + i + m2, err = Create(c2) + if err == nil { + break + } + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + defer m2.Shutdown() + + num, err := m2.Join([]string{fmt.Sprintf("%s:%d", m1.config.BindAddr, 23456)}) + if num != 1 { + t.Fatalf("unexpected 1: %d", num) + } + if err != nil { + t.Fatalf("unexpected err: %s", err) + } + + // Check the hosts + if len(m2.Members()) != 2 { + t.Fatalf("should have 2 nodes! %v", m2.Members()) + } + + if len(m1.Members()) != 2 { + t.Fatalf("should have 2 nodes! 
%v", m2.Members()) + } +} + +func TestAdvertiseAddr(t *testing.T) { + c := testConfig() + c.AdvertiseAddr = "127.0.1.100" + c.AdvertisePort = 23456 + + m, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + yield() + + members := m.Members() + if len(members) != 1 { + t.Fatalf("bad number of members") + } + + if bytes.Compare(members[0].Addr, []byte{127, 0, 1, 100}) != 0 { + t.Fatalf("bad: %#v", members[0]) + } + + if members[0].Port != 23456 { + t.Fatalf("bad: %#v", members[0]) + } +} + +type MockConflict struct { + existing *Node + other *Node +} + +func (m *MockConflict) NotifyConflict(existing, other *Node) { + m.existing = existing + m.other = other +} + +func TestMemberlist_conflictDelegate(t *testing.T) { + c1 := testConfig() + c2 := testConfig() + mock := &MockConflict{} + c1.Conflict = mock + + // Ensure name conflict + c2.Name = c1.Name + + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m1.Shutdown() + + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m2.Shutdown() + + _, err = m1.Join([]string{c2.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } + + yield() + + // Ensure we were notified + if mock.existing == nil || mock.other == nil { + t.Fatalf("should get notified") + } + if mock.existing.Name != mock.other.Name { + t.Fatalf("bad: %v %v", mock.existing, mock.other) + } +} + +type MockPing struct { + other *Node + rtt time.Duration + payload []byte +} + +func (m *MockPing) NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) { + m.other = other + m.rtt = rtt + m.payload = payload +} + +const DEFAULT_PAYLOAD = "whatever" + +func (m *MockPing) AckPayload() []byte { + return []byte(DEFAULT_PAYLOAD) +} + +func TestMemberlist_PingDelegate(t *testing.T) { + m1 := GetMemberlist(t) + m1.config.Ping = &MockPing{} + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + // Create a second node + c := DefaultLANConfig() + addr1 := 
getBindAddr() + c.Name = addr1.String() + c.BindAddr = addr1.String() + c.BindPort = m1.config.BindPort + c.ProbeInterval = time.Millisecond + mock := &MockPing{} + c.Ping = mock + + m2, err := Create(c) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m2.Shutdown() + + _, err = m2.Join([]string{m1.config.BindAddr}) + if err != nil { + t.Fatalf("err: %s", err) + } + + yield() + + // Ensure we were notified + if mock.other == nil { + t.Fatalf("should get notified") + } + + if !reflect.DeepEqual(mock.other, m1.LocalNode()) { + t.Fatalf("not notified about the correct node; expected: %+v; actual: %+v", + m2.LocalNode(), mock.other) + } + + if mock.rtt <= 0 { + t.Fatalf("rtt should be greater than 0") + } + + if bytes.Compare(mock.payload, []byte(DEFAULT_PAYLOAD)) != 0 { + t.Fatalf("incorrect payload. expected: %v; actual: %v", []byte(DEFAULT_PAYLOAD), mock.payload) + } +} + +// Consul bug, rapid restart (before failure detection), +// with an updated meta data. Should be at incarnation 1 for +// both. +// +// This test is uncommented because it requires that either we +// can rebind the socket (SO_REUSEPORT) which Go does not allow, +// OR we must disable the address conflict checking in memberlist. +// I just comment out that code to test this case. 
+// +//func TestMemberlist_Restart_delegateMeta_Update(t *testing.T) { +// c1 := testConfig() +// c2 := testConfig() +// mock1 := &MockDelegate{meta: []byte("web")} +// mock2 := &MockDelegate{meta: []byte("lb")} +// c1.Delegate = mock1 +// c2.Delegate = mock2 + +// m1, err := Create(c1) +// if err != nil { +// t.Fatalf("err: %s", err) +// } +// defer m1.Shutdown() + +// m2, err := Create(c2) +// if err != nil { +// t.Fatalf("err: %s", err) +// } +// defer m2.Shutdown() + +// _, err = m1.Join([]string{c2.BindAddr}) +// if err != nil { +// t.Fatalf("err: %s", err) +// } + +// yield() + +// // Recreate m1 with updated meta +// m1.Shutdown() +// c3 := testConfig() +// c3.Name = c1.Name +// c3.Delegate = mock1 +// c3.GossipInterval = time.Millisecond +// mock1.meta = []byte("api") + +// m1, err = Create(c3) +// if err != nil { +// t.Fatalf("err: %s", err) +// } +// defer m1.Shutdown() + +// _, err = m1.Join([]string{c2.BindAddr}) +// if err != nil { +// t.Fatalf("err: %s", err) +// } + +// yield() +// yield() + +// // Check the updates have propagated +// var roles map[string]string + +// // Check the roles of members of m1 +// m1m := m1.Members() +// if len(m1m) != 2 { +// t.Fatalf("bad: %#v", m1m) +// } + +// roles = make(map[string]string) +// for _, m := range m1m { +// roles[m.Name] = string(m.Meta) +// } + +// if r := roles[c1.Name]; r != "api" { +// t.Fatalf("bad role for %s: %s", c1.Name, r) +// } + +// if r := roles[c2.Name]; r != "lb" { +// t.Fatalf("bad role for %s: %s", c2.Name, r) +// } + +// // Check the roles of members of m2 +// m2m := m2.Members() +// if len(m2m) != 2 { +// t.Fatalf("bad: %#v", m2m) +// } + +// roles = make(map[string]string) +// for _, m := range m2m { +// roles[m.Name] = string(m.Meta) +// } + +// if r := roles[c1.Name]; r != "api" { +// t.Fatalf("bad role for %s: %s", c1.Name, r) +// } + +// if r := roles[c2.Name]; r != "lb" { +// t.Fatalf("bad role for %s: %s", c2.Name, r) +// } +//} diff --git 
a/vendor/github.com/hashicorp/memberlist/merge_delegate.go b/vendor/github.com/hashicorp/memberlist/merge_delegate.go new file mode 100644 index 000000000..89afb59f2 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/merge_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +// MergeDelegate is used to involve a client in +// a potential cluster merge operation. Namely, when +// a node does a TCP push/pull (as part of a join), +// the delegate is involved and allowed to cancel the join +// based on custom logic. The merge delegate is NOT invoked +// as part of the push-pull anti-entropy. +type MergeDelegate interface { + // NotifyMerge is invoked when a merge could take place. + // Provides a list of the nodes known by the peer. If + // the return value is non-nil, the merge is canceled. + NotifyMerge(peers []*Node) error +} diff --git a/vendor/github.com/hashicorp/memberlist/mock_transport.go b/vendor/github.com/hashicorp/memberlist/mock_transport.go new file mode 100644 index 000000000..b8bafa802 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/mock_transport.go @@ -0,0 +1,121 @@ +package memberlist + +import ( + "fmt" + "net" + "strconv" + "time" +) + +// MockNetwork is used as a factory that produces MockTransport instances which +// are uniquely addressed and wired up to talk to each other. +type MockNetwork struct { + transports map[string]*MockTransport + port int +} + +// NewTransport returns a new MockTransport with a unique address, wired up to +// talk to the other transports in the MockNetwork. 
+func (n *MockNetwork) NewTransport() *MockTransport { + n.port += 1 + addr := fmt.Sprintf("127.0.0.1:%d", n.port) + transport := &MockTransport{ + net: n, + addr: &MockAddress{addr}, + packetCh: make(chan *Packet), + streamCh: make(chan net.Conn), + } + + if n.transports == nil { + n.transports = make(map[string]*MockTransport) + } + n.transports[addr] = transport + return transport +} + +// MockAddress is a wrapper which adds the net.Addr interface to our mock +// address scheme. +type MockAddress struct { + addr string +} + +// See net.Addr. +func (a *MockAddress) Network() string { + return "mock" +} + +// See net.Addr. +func (a *MockAddress) String() string { + return a.addr +} + +// MockTransport directly plumbs messages to other transports its MockNetwork. +type MockTransport struct { + net *MockNetwork + addr *MockAddress + packetCh chan *Packet + streamCh chan net.Conn +} + +// See Transport. +func (t *MockTransport) FinalAdvertiseAddr(string, int) (net.IP, int, error) { + host, portStr, err := net.SplitHostPort(t.addr.String()) + if err != nil { + return nil, 0, err + } + + ip := net.ParseIP(host) + if ip == nil { + return nil, 0, fmt.Errorf("Failed to parse IP %q", host) + } + + port, err := strconv.ParseInt(portStr, 10, 16) + if err != nil { + return nil, 0, err + } + + return ip, int(port), nil +} + +// See Transport. +func (t *MockTransport) WriteTo(b []byte, addr string) (time.Time, error) { + dest, ok := t.net.transports[addr] + if !ok { + return time.Time{}, fmt.Errorf("No route to %q", addr) + } + + now := time.Now() + dest.packetCh <- &Packet{ + Buf: b, + From: t.addr, + Timestamp: now, + } + return now, nil +} + +// See Transport. +func (t *MockTransport) PacketCh() <-chan *Packet { + return t.packetCh +} + +// See Transport. 
+func (t *MockTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { + dest, ok := t.net.transports[addr] + if !ok { + return nil, fmt.Errorf("No route to %q", addr) + } + + p1, p2 := net.Pipe() + dest.streamCh <- p1 + return p2, nil +} + +// See Transport. +func (t *MockTransport) StreamCh() <-chan net.Conn { + return t.streamCh +} + +// See Transport. +func (t *MockTransport) Shutdown() error { + return nil +} diff --git a/vendor/github.com/hashicorp/memberlist/net.go b/vendor/github.com/hashicorp/memberlist/net.go new file mode 100644 index 000000000..e0036d01d --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/net.go @@ -0,0 +1,1069 @@ +package memberlist + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "net" + "time" + + "github.com/armon/go-metrics" + "github.com/hashicorp/go-msgpack/codec" +) + +// This is the minimum and maximum protocol version that we can +// _understand_. We're allowed to speak at any version within this +// range. This range is inclusive. +const ( + ProtocolVersionMin uint8 = 1 + + // Version 3 added support for TCP pings but we kept the default + // protocol version at 2 to ease transition to this new feature. + // A memberlist speaking version 2 of the protocol will attempt + // to TCP ping another memberlist who understands version 3 or + // greater. + // + // Version 4 added support for nacks as part of indirect probes. + // A memberlist speaking version 2 of the protocol will expect + // nacks from another memberlist who understands version 4 or + // greater, and likewise nacks will be sent to memberlists who + // understand version 4 or greater. + ProtocolVersion2Compatible = 2 + + ProtocolVersionMax = 5 +) + +// messageType is an integer ID of a type of message that can be received +// on network channels from other members. +type messageType uint8 + +// The list of available message types. 
const (
	pingMsg messageType = iota
	indirectPingMsg
	ackRespMsg
	suspectMsg
	aliveMsg
	deadMsg
	pushPullMsg
	compoundMsg
	userMsg // User message, not handled by us; passed to the delegate
	compressMsg
	encryptMsg
	nackRespMsg
	hasCrcMsg
)

// compressionType is used to specify the compression algorithm
type compressionType uint8

const (
	lzwAlgo compressionType = iota
)

const (
	MetaMaxSize            = 512                   // Maximum size for node meta data
	compoundHeaderOverhead = 2                     // Assumed header overhead
	compoundOverhead       = 2                     // Assumed overhead per entry in compoundHeader
	userMsgOverhead        = 1                     // Message-type byte prepended to each user message
	blockingWarning        = 10 * time.Millisecond // Warn if a UDP packet takes this long to process
	maxPushStateBytes      = 10 * 1024 * 1024      // Cap on push/pull state size to bound memory use
)

// ping request sent directly to node
type ping struct {
	SeqNo uint32

	// Node is sent so the target can verify they are
	// the intended recipient. This is to protect against an agent
	// restart with a new name.
	Node string
}

// indirect ping sent to an indirect node, asking it to probe the
// target on the original requester's behalf
type indirectPingReq struct {
	SeqNo  uint32
	Target []byte
	Port   uint16
	Node   string
	Nack   bool // true if we'd like a nack back
}

// ack response is sent for a ping
type ackResp struct {
	SeqNo   uint32
	Payload []byte
}

// nack response is sent for an indirect ping when the pinger doesn't hear from
// the ping-ee within the configured timeout. This lets the original node know
// that the indirect ping attempt happened but didn't succeed.
type nackResp struct {
	SeqNo uint32
}

// suspect is broadcast when we suspect a node is dead
type suspect struct {
	Incarnation uint32
	Node        string
	From        string // Include who is suspecting
}

// alive is broadcast when we know a node is alive.
+// Overloaded for nodes joining +type alive struct { + Incarnation uint32 + Node string + Addr []byte + Port uint16 + Meta []byte + + // The versions of the protocol/delegate that are being spoken, order: + // pmin, pmax, pcur, dmin, dmax, dcur + Vsn []uint8 +} + +// dead is broadcast when we confirm a node is dead +// Overloaded for nodes leaving +type dead struct { + Incarnation uint32 + Node string + From string // Include who is suspecting +} + +// pushPullHeader is used to inform the +// otherside how many states we are transferring +type pushPullHeader struct { + Nodes int + UserStateLen int // Encodes the byte lengh of user state + Join bool // Is this a join request or a anti-entropy run +} + +// userMsgHeader is used to encapsulate a userMsg +type userMsgHeader struct { + UserMsgLen int // Encodes the byte lengh of user state +} + +// pushNodeState is used for pushPullReq when we are +// transferring out node states +type pushNodeState struct { + Name string + Addr []byte + Port uint16 + Meta []byte + Incarnation uint32 + State nodeStateType + Vsn []uint8 // Protocol versions +} + +// compress is used to wrap an underlying payload +// using a specified compression algorithm +type compress struct { + Algo compressionType + Buf []byte +} + +// msgHandoff is used to transfer a message between goroutines +type msgHandoff struct { + msgType messageType + buf []byte + from net.Addr +} + +// encryptionVersion returns the encryption version to use +func (m *Memberlist) encryptionVersion() encryptionVersion { + switch m.ProtocolVersion() { + case 1: + return 0 + default: + return 1 + } +} + +// streamListen is a long running goroutine that pulls incoming streams from the +// transport and hands them off for processing. +func (m *Memberlist) streamListen() { + for { + select { + case conn := <-m.transport.StreamCh(): + go m.handleConn(conn) + + case <-m.shutdownCh: + return + } + } +} + +// handleConn handles a single incoming stream connection from the transport. 
func (m *Memberlist) handleConn(conn net.Conn) {
	m.logger.Printf("[DEBUG] memberlist: Stream connection %s", LogConn(conn))

	defer conn.Close()
	metrics.IncrCounter([]string{"memberlist", "tcp", "accept"}, 1)

	// The whole exchange (read request, push state, send response) must
	// finish within TCPTimeout; a single deadline covers all of it.
	conn.SetDeadline(time.Now().Add(m.config.TCPTimeout))
	msgType, bufConn, dec, err := m.readStream(conn)
	if err != nil {
		// EOF just means the peer connected and hung up (e.g. a port
		// probe); only log real failures.
		if err != io.EOF {
			m.logger.Printf("[ERR] memberlist: failed to receive: %s %s", err, LogConn(conn))
		}
		return
	}

	switch msgType {
	case userMsg:
		if err := m.readUserMsg(bufConn, dec); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to receive user message: %s %s", err, LogConn(conn))
		}
	case pushPullMsg:
		// Anti-entropy exchange: read the remote state first, then send
		// ours back over the same connection, then merge. The order
		// matters — the initiator expects our state as the reply.
		join, remoteNodes, userState, err := m.readRemoteState(bufConn, dec)
		if err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to read remote state: %s %s", err, LogConn(conn))
			return
		}

		if err := m.sendLocalState(conn, join); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to push local state: %s %s", err, LogConn(conn))
			return
		}

		if err := m.mergeRemoteState(join, remoteNodes, userState); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed push/pull merge: %s %s", err, LogConn(conn))
			return
		}
	case pingMsg:
		var p ping
		if err := dec.Decode(&p); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to decode ping: %s %s", err, LogConn(conn))
			return
		}

		// Guard against a restarted agent reusing our address under a
		// different name: only answer pings addressed to us.
		if p.Node != "" && p.Node != m.config.Name {
			m.logger.Printf("[WARN] memberlist: Got ping for unexpected node %s %s", p.Node, LogConn(conn))
			return
		}

		// TCP pings carry no payload in the ack (unlike UDP acks,
		// which may include the Ping delegate's payload).
		ack := ackResp{p.SeqNo, nil}
		out, err := encode(ackRespMsg, &ack)
		if err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to encode ack: %s", err)
			return
		}

		err = m.rawSendMsgStream(conn, out.Bytes())
		if err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogConn(conn))
			return
		}
	default:
		m.logger.Printf("[ERR] memberlist: Received invalid msgType (%d) %s", msgType, LogConn(conn))
	}
}

+// packetListen is a long running goroutine that pulls packets out of the +// transport and hands them off for processing. +func (m *Memberlist) packetListen() { + for { + select { + case packet := <-m.transport.PacketCh(): + m.ingestPacket(packet.Buf, packet.From, packet.Timestamp) + + case <-m.shutdownCh: + return + } + } +} + +func (m *Memberlist) ingestPacket(buf []byte, from net.Addr, timestamp time.Time) { + // Check if encryption is enabled + if m.config.EncryptionEnabled() { + // Decrypt the payload + plain, err := decryptPayload(m.config.Keyring.GetKeys(), buf, nil) + if err != nil { + m.logger.Printf("[ERR] memberlist: Decrypt packet failed: %v %s", err, LogAddress(from)) + return + } + + // Continue processing the plaintext buffer + buf = plain + } + + // See if there's a checksum included to verify the contents of the message + if len(buf) >= 5 && messageType(buf[0]) == hasCrcMsg { + crc := crc32.ChecksumIEEE(buf[5:]) + expected := binary.BigEndian.Uint32(buf[1:5]) + if crc != expected { + m.logger.Printf("[WARN] memberlist: Got invalid checksum for UDP packet: %x, %x", crc, expected) + return + } + m.handleCommand(buf[5:], from, timestamp) + } else { + m.handleCommand(buf, from, timestamp) + } +} + +func (m *Memberlist) handleCommand(buf []byte, from net.Addr, timestamp time.Time) { + // Decode the message type + msgType := messageType(buf[0]) + buf = buf[1:] + + // Switch on the msgType + switch msgType { + case compoundMsg: + m.handleCompound(buf, from, timestamp) + case compressMsg: + m.handleCompressed(buf, from, timestamp) + + case pingMsg: + m.handlePing(buf, from) + case indirectPingMsg: + m.handleIndirectPing(buf, from) + case ackRespMsg: + m.handleAck(buf, from, timestamp) + case nackRespMsg: + m.handleNack(buf, from) + + case suspectMsg: + fallthrough + case aliveMsg: + fallthrough + case deadMsg: + fallthrough + case userMsg: + select { + case m.handoff <- msgHandoff{msgType, buf, from}: + default: + m.logger.Printf("[WARN] memberlist: 
handler queue full, dropping message (%d) %s", msgType, LogAddress(from)) + } + + default: + m.logger.Printf("[ERR] memberlist: msg type (%d) not supported %s", msgType, LogAddress(from)) + } +} + +// packetHandler is a long running goroutine that processes messages received +// over the packet interface, but is decoupled from the listener to avoid +// blocking the listener which may cause ping/ack messages to be delayed. +func (m *Memberlist) packetHandler() { + for { + select { + case msg := <-m.handoff: + msgType := msg.msgType + buf := msg.buf + from := msg.from + + switch msgType { + case suspectMsg: + m.handleSuspect(buf, from) + case aliveMsg: + m.handleAlive(buf, from) + case deadMsg: + m.handleDead(buf, from) + case userMsg: + m.handleUser(buf, from) + default: + m.logger.Printf("[ERR] memberlist: Message type (%d) not supported %s (packet handler)", msgType, LogAddress(from)) + } + + case <-m.shutdownCh: + return + } + } +} + +func (m *Memberlist) handleCompound(buf []byte, from net.Addr, timestamp time.Time) { + // Decode the parts + trunc, parts, err := decodeCompoundMessage(buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode compound request: %s %s", err, LogAddress(from)) + return + } + + // Log any truncation + if trunc > 0 { + m.logger.Printf("[WARN] memberlist: Compound request had %d truncated messages %s", trunc, LogAddress(from)) + } + + // Handle each message + for _, part := range parts { + m.handleCommand(part, from, timestamp) + } +} + +func (m *Memberlist) handlePing(buf []byte, from net.Addr) { + var p ping + if err := decode(buf, &p); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode ping request: %s %s", err, LogAddress(from)) + return + } + // If node is provided, verify that it is for us + if p.Node != "" && p.Node != m.config.Name { + m.logger.Printf("[WARN] memberlist: Got ping for unexpected node '%s' %s", p.Node, LogAddress(from)) + return + } + var ack ackResp + ack.SeqNo = p.SeqNo + if 
m.config.Ping != nil { + ack.Payload = m.config.Ping.AckPayload() + } + if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ack: %s %s", err, LogAddress(from)) + } +} + +func (m *Memberlist) handleIndirectPing(buf []byte, from net.Addr) { + var ind indirectPingReq + if err := decode(buf, &ind); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode indirect ping request: %s %s", err, LogAddress(from)) + return + } + + // For proto versions < 2, there is no port provided. Mask old + // behavior by using the configured port. + if m.ProtocolVersion() < 2 || ind.Port == 0 { + ind.Port = uint16(m.config.BindPort) + } + + // Send a ping to the correct host. + localSeqNo := m.nextSeqNo() + ping := ping{SeqNo: localSeqNo, Node: ind.Node} + + // Setup a response handler to relay the ack + cancelCh := make(chan struct{}) + respHandler := func(payload []byte, timestamp time.Time) { + // Try to prevent the nack if we've caught it in time. + close(cancelCh) + + // Forward the ack back to the requestor. + ack := ackResp{ind.SeqNo, nil} + if err := m.encodeAndSendMsg(from.String(), ackRespMsg, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to forward ack: %s %s", err, LogAddress(from)) + } + } + m.setAckHandler(localSeqNo, respHandler, m.config.ProbeTimeout) + + // Send the ping. + addr := joinHostPort(net.IP(ind.Target).String(), ind.Port) + if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ping: %s %s", err, LogAddress(from)) + } + + // Setup a timer to fire off a nack if no ack is seen in time. 
+ if ind.Nack { + go func() { + select { + case <-cancelCh: + return + case <-time.After(m.config.ProbeTimeout): + nack := nackResp{ind.SeqNo} + if err := m.encodeAndSendMsg(from.String(), nackRespMsg, &nack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send nack: %s %s", err, LogAddress(from)) + } + } + }() + } +} + +func (m *Memberlist) handleAck(buf []byte, from net.Addr, timestamp time.Time) { + var ack ackResp + if err := decode(buf, &ack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode ack response: %s %s", err, LogAddress(from)) + return + } + m.invokeAckHandler(ack, timestamp) +} + +func (m *Memberlist) handleNack(buf []byte, from net.Addr) { + var nack nackResp + if err := decode(buf, &nack); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode nack response: %s %s", err, LogAddress(from)) + return + } + m.invokeNackHandler(nack) +} + +func (m *Memberlist) handleSuspect(buf []byte, from net.Addr) { + var sus suspect + if err := decode(buf, &sus); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode suspect message: %s %s", err, LogAddress(from)) + return + } + m.suspectNode(&sus) +} + +func (m *Memberlist) handleAlive(buf []byte, from net.Addr) { + var live alive + if err := decode(buf, &live); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode alive message: %s %s", err, LogAddress(from)) + return + } + + // For proto versions < 2, there is no port provided. 
Mask old + // behavior by using the configured port + if m.ProtocolVersion() < 2 || live.Port == 0 { + live.Port = uint16(m.config.BindPort) + } + + m.aliveNode(&live, nil, false) +} + +func (m *Memberlist) handleDead(buf []byte, from net.Addr) { + var d dead + if err := decode(buf, &d); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decode dead message: %s %s", err, LogAddress(from)) + return + } + m.deadNode(&d) +} + +// handleUser is used to notify channels of incoming user data +func (m *Memberlist) handleUser(buf []byte, from net.Addr) { + d := m.config.Delegate + if d != nil { + d.NotifyMsg(buf) + } +} + +// handleCompressed is used to unpack a compressed message +func (m *Memberlist) handleCompressed(buf []byte, from net.Addr, timestamp time.Time) { + // Try to decode the payload + payload, err := decompressPayload(buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to decompress payload: %v %s", err, LogAddress(from)) + return + } + + // Recursively handle the payload + m.handleCommand(payload, from, timestamp) +} + +// encodeAndSendMsg is used to combine the encoding and sending steps +func (m *Memberlist) encodeAndSendMsg(addr string, msgType messageType, msg interface{}) error { + out, err := encode(msgType, msg) + if err != nil { + return err + } + if err := m.sendMsg(addr, out.Bytes()); err != nil { + return err + } + return nil +} + +// sendMsg is used to send a message via packet to another host. It will +// opportunistically create a compoundMsg and piggy back other broadcasts. 
+func (m *Memberlist) sendMsg(addr string, msg []byte) error { + // Check if we can piggy back any messages + bytesAvail := m.config.UDPBufferSize - len(msg) - compoundHeaderOverhead + if m.config.EncryptionEnabled() { + bytesAvail -= encryptOverhead(m.encryptionVersion()) + } + extra := m.getBroadcasts(compoundOverhead, bytesAvail) + + // Fast path if nothing to piggypack + if len(extra) == 0 { + return m.rawSendMsgPacket(addr, nil, msg) + } + + // Join all the messages + msgs := make([][]byte, 0, 1+len(extra)) + msgs = append(msgs, msg) + msgs = append(msgs, extra...) + + // Create a compound message + compound := makeCompoundMessage(msgs) + + // Send the message + return m.rawSendMsgPacket(addr, nil, compound.Bytes()) +} + +// rawSendMsgPacket is used to send message via packet to another host without +// modification, other than compression or encryption if enabled. +func (m *Memberlist) rawSendMsgPacket(addr string, node *Node, msg []byte) error { + // Check if we have compression enabled + if m.config.EnableCompression { + buf, err := compressPayload(msg) + if err != nil { + m.logger.Printf("[WARN] memberlist: Failed to compress payload: %v", err) + } else { + // Only use compression if it reduced the size + if buf.Len() < len(msg) { + msg = buf.Bytes() + } + } + } + + // Try to look up the destination node + if node == nil { + toAddr, _, err := net.SplitHostPort(addr) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed to parse address %q: %v", addr, err) + return err + } + m.nodeLock.RLock() + nodeState, ok := m.nodeMap[toAddr] + m.nodeLock.RUnlock() + if ok { + node = &nodeState.Node + } + } + + // Add a CRC to the end of the payload if the recipient understands + // ProtocolVersion >= 5 + if node != nil && node.PMax >= 5 { + crc := crc32.ChecksumIEEE(msg) + header := make([]byte, 5, 5+len(msg)) + header[0] = byte(hasCrcMsg) + binary.BigEndian.PutUint32(header[1:], crc) + msg = append(header, msg...) 
+ } + + // Check if we have encryption enabled + if m.config.EncryptionEnabled() { + // Encrypt the payload + var buf bytes.Buffer + primaryKey := m.config.Keyring.GetPrimaryKey() + err := encryptPayload(m.encryptionVersion(), primaryKey, msg, nil, &buf) + if err != nil { + m.logger.Printf("[ERR] memberlist: Encryption of message failed: %v", err) + return err + } + msg = buf.Bytes() + } + + metrics.IncrCounter([]string{"memberlist", "udp", "sent"}, float32(len(msg))) + _, err := m.transport.WriteTo(msg, addr) + return err +} + +// rawSendMsgStream is used to stream a message to another host without +// modification, other than applying compression and encryption if enabled. +func (m *Memberlist) rawSendMsgStream(conn net.Conn, sendBuf []byte) error { + // Check if compresion is enabled + if m.config.EnableCompression { + compBuf, err := compressPayload(sendBuf) + if err != nil { + m.logger.Printf("[ERROR] memberlist: Failed to compress payload: %v", err) + } else { + sendBuf = compBuf.Bytes() + } + } + + // Check if encryption is enabled + if m.config.EncryptionEnabled() { + crypt, err := m.encryptLocalState(sendBuf) + if err != nil { + m.logger.Printf("[ERROR] memberlist: Failed to encrypt local state: %v", err) + return err + } + sendBuf = crypt + } + + // Write out the entire send buffer + metrics.IncrCounter([]string{"memberlist", "tcp", "sent"}, float32(len(sendBuf))) + + if n, err := conn.Write(sendBuf); err != nil { + return err + } else if n != len(sendBuf) { + return fmt.Errorf("only %d of %d bytes written", n, len(sendBuf)) + } + + return nil +} + +// sendUserMsg is used to stream a user message to another host. 
+func (m *Memberlist) sendUserMsg(addr string, sendBuf []byte) error { + conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) + if err != nil { + return err + } + defer conn.Close() + + bufConn := bytes.NewBuffer(nil) + if err := bufConn.WriteByte(byte(userMsg)); err != nil { + return err + } + + header := userMsgHeader{UserMsgLen: len(sendBuf)} + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(bufConn, &hd) + if err := enc.Encode(&header); err != nil { + return err + } + if _, err := bufConn.Write(sendBuf); err != nil { + return err + } + return m.rawSendMsgStream(conn, bufConn.Bytes()) +} + +// sendAndReceiveState is used to initiate a push/pull over a stream with a +// remote host. +func (m *Memberlist) sendAndReceiveState(addr string, join bool) ([]pushNodeState, []byte, error) { + // Attempt to connect + conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) + if err != nil { + return nil, nil, err + } + defer conn.Close() + m.logger.Printf("[DEBUG] memberlist: Initiating push/pull sync with: %s", conn.RemoteAddr()) + metrics.IncrCounter([]string{"memberlist", "tcp", "connect"}, 1) + + // Send our state + if err := m.sendLocalState(conn, join); err != nil { + return nil, nil, err + } + + conn.SetDeadline(time.Now().Add(m.config.TCPTimeout)) + msgType, bufConn, dec, err := m.readStream(conn) + if err != nil { + return nil, nil, err + } + + // Quit if not push/pull + if msgType != pushPullMsg { + err := fmt.Errorf("received invalid msgType (%d), expected pushPullMsg (%d) %s", msgType, pushPullMsg, LogConn(conn)) + return nil, nil, err + } + + // Read remote state + _, remoteNodes, userState, err := m.readRemoteState(bufConn, dec) + return remoteNodes, userState, err +} + +// sendLocalState is invoked to send our local state over a stream connection. 
func (m *Memberlist) sendLocalState(conn net.Conn, join bool) error {
	// Setup a deadline
	conn.SetDeadline(time.Now().Add(m.config.TCPTimeout))

	// Prepare the local node state under the read lock so the snapshot is
	// consistent; the copy lets us release the lock before any I/O.
	m.nodeLock.RLock()
	localNodes := make([]pushNodeState, len(m.nodes))
	for idx, n := range m.nodes {
		localNodes[idx].Name = n.Name
		localNodes[idx].Addr = n.Addr
		localNodes[idx].Port = n.Port
		localNodes[idx].Incarnation = n.Incarnation
		localNodes[idx].State = n.State
		localNodes[idx].Meta = n.Meta
		// Version vector layout: [PMin, PMax, PCur, DMin, DMax, DCur].
		localNodes[idx].Vsn = []uint8{
			n.PMin, n.PMax, n.PCur,
			n.DMin, n.DMax, n.DCur,
		}
	}
	m.nodeLock.RUnlock()

	// Get the delegate state
	var userData []byte
	if m.config.Delegate != nil {
		userData = m.config.Delegate.LocalState(join)
	}

	// Create a bytes buffer writer
	bufConn := bytes.NewBuffer(nil)

	// Send our node state
	header := pushPullHeader{Nodes: len(localNodes), UserStateLen: len(userData), Join: join}
	hd := codec.MsgpackHandle{}
	enc := codec.NewEncoder(bufConn, &hd)

	// Begin state push: type byte, then header, then each node state.
	if _, err := bufConn.Write([]byte{byte(pushPullMsg)}); err != nil {
		return err
	}

	if err := enc.Encode(&header); err != nil {
		return err
	}
	for i := 0; i < header.Nodes; i++ {
		if err := enc.Encode(&localNodes[i]); err != nil {
			return err
		}
	}

	// Write the user state as well, appended raw after the encoded nodes
	// (the receiver sizes it from header.UserStateLen).
	if userData != nil {
		if _, err := bufConn.Write(userData); err != nil {
			return err
		}
	}

	// Get the send buffer
	return m.rawSendMsgStream(conn, bufConn.Bytes())
}

// encryptLocalState is used to help encrypt local state before sending
func (m *Memberlist) encryptLocalState(sendBuf []byte) ([]byte, error) {
	var buf bytes.Buffer

	// Write the encryptMsg byte
	buf.WriteByte(byte(encryptMsg))

	// Write the size of the message
	sizeBuf := make([]byte, 4)
	encVsn := m.encryptionVersion()
	encLen := encryptedLength(encVsn, len(sendBuf))
	binary.BigEndian.PutUint32(sizeBuf, uint32(encLen))
	buf.Write(sizeBuf)

	// Write the encrypted cipher text to the buffer. The 5-byte header
	// written above (type byte + big-endian length) is passed to
	// encryptPayload, presumably as associated data bound to the ciphertext
	// — it mirrors the dataBytes slice rebuilt in decryptRemoteState.
	key := m.config.Keyring.GetPrimaryKey()
	err := encryptPayload(encVsn, key, sendBuf, buf.Bytes()[:5], &buf)
	if err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// decryptRemoteState is used to help decrypt the remote state
func (m *Memberlist) decryptRemoteState(bufConn io.Reader) ([]byte, error) {
	// Read in enough to determine message length. The encryptMsg type byte
	// was already consumed by the caller, so re-prepend it to reconstruct
	// the exact 5-byte header the sender authenticated.
	cipherText := bytes.NewBuffer(nil)
	cipherText.WriteByte(byte(encryptMsg))
	_, err := io.CopyN(cipherText, bufConn, 4)
	if err != nil {
		return nil, err
	}

	// Ensure we aren't asked to download too much. This is to guard against
	// an attack vector where a huge amount of state is sent
	moreBytes := binary.BigEndian.Uint32(cipherText.Bytes()[1:5])
	if moreBytes > maxPushStateBytes {
		return nil, fmt.Errorf("Remote node state is larger than limit (%d)", moreBytes)
	}

	// Read in the rest of the payload
	_, err = io.CopyN(cipherText, bufConn, int64(moreBytes))
	if err != nil {
		return nil, err
	}

	// Decrypt the cipherText: first 5 bytes are the header (matching what
	// encryptLocalState fed to encryptPayload), the remainder is ciphertext.
	dataBytes := cipherText.Bytes()[:5]
	cipherBytes := cipherText.Bytes()[5:]

	// Decrypt the payload
	keys := m.config.Keyring.GetKeys()
	return decryptPayload(keys, cipherBytes, dataBytes)
}

// readStream is used to read from a stream connection, decrypting and
// decompressing the stream if necessary.
+func (m *Memberlist) readStream(conn net.Conn) (messageType, io.Reader, *codec.Decoder, error) { + // Created a buffered reader + var bufConn io.Reader = bufio.NewReader(conn) + + // Read the message type + buf := [1]byte{0} + if _, err := bufConn.Read(buf[:]); err != nil { + return 0, nil, nil, err + } + msgType := messageType(buf[0]) + + // Check if the message is encrypted + if msgType == encryptMsg { + if !m.config.EncryptionEnabled() { + return 0, nil, nil, + fmt.Errorf("Remote state is encrypted and encryption is not configured") + } + + plain, err := m.decryptRemoteState(bufConn) + if err != nil { + return 0, nil, nil, err + } + + // Reset message type and bufConn + msgType = messageType(plain[0]) + bufConn = bytes.NewReader(plain[1:]) + } else if m.config.EncryptionEnabled() { + return 0, nil, nil, + fmt.Errorf("Encryption is configured but remote state is not encrypted") + } + + // Get the msgPack decoders + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(bufConn, &hd) + + // Check if we have a compressed message + if msgType == compressMsg { + var c compress + if err := dec.Decode(&c); err != nil { + return 0, nil, nil, err + } + decomp, err := decompressBuffer(&c) + if err != nil { + return 0, nil, nil, err + } + + // Reset the message type + msgType = messageType(decomp[0]) + + // Create a new bufConn + bufConn = bytes.NewReader(decomp[1:]) + + // Create a new decoder + dec = codec.NewDecoder(bufConn, &hd) + } + + return msgType, bufConn, dec, nil +} + +// readRemoteState is used to read the remote state from a connection +func (m *Memberlist) readRemoteState(bufConn io.Reader, dec *codec.Decoder) (bool, []pushNodeState, []byte, error) { + // Read the push/pull header + var header pushPullHeader + if err := dec.Decode(&header); err != nil { + return false, nil, nil, err + } + + // Allocate space for the transfer + remoteNodes := make([]pushNodeState, header.Nodes) + + // Try to decode all the states + for i := 0; i < header.Nodes; i++ { + if err 
:= dec.Decode(&remoteNodes[i]); err != nil { + return false, nil, nil, err + } + } + + // Read the remote user state into a buffer + var userBuf []byte + if header.UserStateLen > 0 { + userBuf = make([]byte, header.UserStateLen) + bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserStateLen) + if err == nil && bytes != header.UserStateLen { + err = fmt.Errorf( + "Failed to read full user state (%d / %d)", + bytes, header.UserStateLen) + } + if err != nil { + return false, nil, nil, err + } + } + + // For proto versions < 2, there is no port provided. Mask old + // behavior by using the configured port + for idx := range remoteNodes { + if m.ProtocolVersion() < 2 || remoteNodes[idx].Port == 0 { + remoteNodes[idx].Port = uint16(m.config.BindPort) + } + } + + return header.Join, remoteNodes, userBuf, nil +} + +// mergeRemoteState is used to merge the remote state with our local state +func (m *Memberlist) mergeRemoteState(join bool, remoteNodes []pushNodeState, userBuf []byte) error { + if err := m.verifyProtocol(remoteNodes); err != nil { + return err + } + + // Invoke the merge delegate if any + if join && m.config.Merge != nil { + nodes := make([]*Node, len(remoteNodes)) + for idx, n := range remoteNodes { + nodes[idx] = &Node{ + Name: n.Name, + Addr: n.Addr, + Port: n.Port, + Meta: n.Meta, + PMin: n.Vsn[0], + PMax: n.Vsn[1], + PCur: n.Vsn[2], + DMin: n.Vsn[3], + DMax: n.Vsn[4], + DCur: n.Vsn[5], + } + } + if err := m.config.Merge.NotifyMerge(nodes); err != nil { + return err + } + } + + // Merge the membership state + m.mergeState(remoteNodes) + + // Invoke the delegate for user state + if userBuf != nil && m.config.Delegate != nil { + m.config.Delegate.MergeRemoteState(userBuf, join) + } + return nil +} + +// readUserMsg is used to decode a userMsg from a stream. 
+func (m *Memberlist) readUserMsg(bufConn io.Reader, dec *codec.Decoder) error { + // Read the user message header + var header userMsgHeader + if err := dec.Decode(&header); err != nil { + return err + } + + // Read the user message into a buffer + var userBuf []byte + if header.UserMsgLen > 0 { + userBuf = make([]byte, header.UserMsgLen) + bytes, err := io.ReadAtLeast(bufConn, userBuf, header.UserMsgLen) + if err == nil && bytes != header.UserMsgLen { + err = fmt.Errorf( + "Failed to read full user message (%d / %d)", + bytes, header.UserMsgLen) + } + if err != nil { + return err + } + + d := m.config.Delegate + if d != nil { + d.NotifyMsg(userBuf) + } + } + + return nil +} + +// sendPingAndWaitForAck makes a stream connection to the given address, sends +// a ping, and waits for an ack. All of this is done as a series of blocking +// operations, given the deadline. The bool return parameter is true if we +// we able to round trip a ping to the other node. +func (m *Memberlist) sendPingAndWaitForAck(addr string, ping ping, deadline time.Time) (bool, error) { + conn, err := m.transport.DialTimeout(addr, m.config.TCPTimeout) + if err != nil { + // If the node is actually dead we expect this to fail, so we + // shouldn't spam the logs with it. After this point, errors + // with the connection are real, unexpected errors and should + // get propagated up. 
+ return false, nil + } + defer conn.Close() + conn.SetDeadline(deadline) + + out, err := encode(pingMsg, &ping) + if err != nil { + return false, err + } + + if err = m.rawSendMsgStream(conn, out.Bytes()); err != nil { + return false, err + } + + msgType, _, dec, err := m.readStream(conn) + if err != nil { + return false, err + } + + if msgType != ackRespMsg { + return false, fmt.Errorf("Unexpected msgType (%d) from ping %s", msgType, LogConn(conn)) + } + + var ack ackResp + if err = dec.Decode(&ack); err != nil { + return false, err + } + + if ack.SeqNo != ping.SeqNo { + return false, fmt.Errorf("Sequence number from ack (%d) doesn't match ping (%d)", ack.SeqNo, ping.SeqNo, LogConn(conn)) + } + + return true, nil +} diff --git a/vendor/github.com/hashicorp/memberlist/net_test.go b/vendor/github.com/hashicorp/memberlist/net_test.go new file mode 100644 index 000000000..80d3ebb36 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/net_test.go @@ -0,0 +1,787 @@ +package memberlist + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + "log" + "net" + "reflect" + "strings" + "testing" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +// As a regression we left this test very low-level and network-ey, even after +// we abstracted the transport. We added some basic network-free transport tests +// in transport_test.go to prove that we didn't hard code some network stuff +// outside of NetTransport. 
+ +func TestHandleCompoundPing(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Encode a ping + ping := ping{SeqNo: 42} + buf, err := encode(pingMsg, ping) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Make a compound message + compound := makeCompoundMessage([][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}) + + // Send compound version + addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} + udp.WriteTo(compound.Bytes(), addr) + + // Wait for responses + doneCh := make(chan struct{}, 1) + go func() { + select { + case <-doneCh: + case <-time.After(2 * time.Second): + panic("timeout") + } + }() + + for i := 0; i < 3; i++ { + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + msgType := messageType(in[0]) + if msgType != ackRespMsg { + t.Fatalf("bad response %v", in) + } + + var ack ackResp + if err := decode(in[1:], &ack); err != nil { + t.Fatalf("unexpected err %s", err) + } + + if ack.SeqNo != 42 { + t.Fatalf("bad sequence no") + } + } + + doneCh <- struct{}{} +} + +func TestHandlePing(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Encode a ping + ping := ping{SeqNo: 42} + buf, err := encode(pingMsg, ping) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + 
// Send + addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} + udp.WriteTo(buf.Bytes(), addr) + + // Wait for response + doneCh := make(chan struct{}, 1) + go func() { + select { + case <-doneCh: + case <-time.After(2 * time.Second): + panic("timeout") + } + }() + + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + msgType := messageType(in[0]) + if msgType != ackRespMsg { + t.Fatalf("bad response %v", in) + } + + var ack ackResp + if err := decode(in[1:], &ack); err != nil { + t.Fatalf("unexpected err %s", err) + } + + if ack.SeqNo != 42 { + t.Fatalf("bad sequence no") + } + + doneCh <- struct{}{} +} + +func TestHandlePing_WrongNode(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Encode a ping, wrong node! 
+ ping := ping{SeqNo: 42, Node: m.config.Name + "-bad"} + buf, err := encode(pingMsg, ping) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Send + addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} + udp.WriteTo(buf.Bytes(), addr) + + // Wait for response + udp.SetDeadline(time.Now().Add(50 * time.Millisecond)) + in := make([]byte, 1500) + _, _, err = udp.ReadFrom(in) + + // Should get an i/o timeout + if err == nil { + t.Fatalf("expected err %s", err) + } +} + +func TestHandleIndirectPing(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Encode an indirect ping + ind := indirectPingReq{ + SeqNo: 100, + Target: net.ParseIP(m.config.BindAddr), + Port: uint16(m.config.BindPort), + } + buf, err := encode(indirectPingMsg, &ind) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Send + addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} + udp.WriteTo(buf.Bytes(), addr) + + // Wait for response + doneCh := make(chan struct{}, 1) + go func() { + select { + case <-doneCh: + case <-time.After(2 * time.Second): + panic("timeout") + } + }() + + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + msgType := messageType(in[0]) + if msgType != ackRespMsg { + t.Fatalf("bad response %v", in) + } + + var ack ackResp + if err := decode(in[1:], &ack); err != nil { + t.Fatalf("unexpected err %s", err) + } + + if ack.SeqNo != 100 { + t.Fatalf("bad sequence no") + } + + doneCh <- struct{}{} +} + +func TestTCPPing(t *testing.T) { + var tcp *net.TCPListener + var tcpAddr *net.TCPAddr 
+ for port := 60000; port < 61000; port++ { + tcpAddr = &net.TCPAddr{IP: net.ParseIP("127.0.0.1"), Port: port} + tcpLn, err := net.ListenTCP("tcp", tcpAddr) + if err == nil { + tcp = tcpLn + break + } + } + if tcp == nil { + t.Fatalf("no tcp listener") + } + + // Note that tcp gets closed in the last test, so we avoid a deferred + // Close() call here. + + m := GetMemberlist(t) + defer m.Shutdown() + pingTimeout := m.config.ProbeInterval + pingTimeMax := m.config.ProbeInterval + 10*time.Millisecond + + // Do a normal round trip. + pingOut := ping{SeqNo: 23, Node: "mongo"} + go func() { + tcp.SetDeadline(time.Now().Add(pingTimeMax)) + conn, err := tcp.AcceptTCP() + if err != nil { + t.Fatalf("failed to connect: %s", err) + } + defer conn.Close() + + msgType, _, dec, err := m.readStream(conn) + if err != nil { + t.Fatalf("failed to read ping: %s", err) + } + + if msgType != pingMsg { + t.Fatalf("expecting ping, got message type (%d)", msgType) + } + + var pingIn ping + if err := dec.Decode(&pingIn); err != nil { + t.Fatalf("failed to decode ping: %s", err) + } + + if pingIn.SeqNo != pingOut.SeqNo { + t.Fatalf("sequence number isn't correct (%d) vs (%d)", pingIn.SeqNo, pingOut.SeqNo) + } + + if pingIn.Node != pingOut.Node { + t.Fatalf("node name isn't correct (%s) vs (%s)", pingIn.Node, pingOut.Node) + } + + ack := ackResp{pingIn.SeqNo, nil} + out, err := encode(ackRespMsg, &ack) + if err != nil { + t.Fatalf("failed to encode ack: %s", err) + } + + err = m.rawSendMsgStream(conn, out.Bytes()) + if err != nil { + t.Fatalf("failed to send ack: %s", err) + } + }() + deadline := time.Now().Add(pingTimeout) + didContact, err := m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) + if err != nil { + t.Fatalf("error trying to ping: %s", err) + } + if !didContact { + t.Fatalf("expected successful ping") + } + + // Make sure a mis-matched sequence number is caught. 
+ go func() { + tcp.SetDeadline(time.Now().Add(pingTimeMax)) + conn, err := tcp.AcceptTCP() + if err != nil { + t.Fatalf("failed to connect: %s", err) + } + defer conn.Close() + + _, _, dec, err := m.readStream(conn) + if err != nil { + t.Fatalf("failed to read ping: %s", err) + } + + var pingIn ping + if err := dec.Decode(&pingIn); err != nil { + t.Fatalf("failed to decode ping: %s", err) + } + + ack := ackResp{pingIn.SeqNo + 1, nil} + out, err := encode(ackRespMsg, &ack) + if err != nil { + t.Fatalf("failed to encode ack: %s", err) + } + + err = m.rawSendMsgStream(conn, out.Bytes()) + if err != nil { + t.Fatalf("failed to send ack: %s", err) + } + }() + deadline = time.Now().Add(pingTimeout) + didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) + if err == nil || !strings.Contains(err.Error(), "Sequence number") { + t.Fatalf("expected an error from mis-matched sequence number") + } + if didContact { + t.Fatalf("expected failed ping") + } + + // Make sure an unexpected message type is handled gracefully. + go func() { + tcp.SetDeadline(time.Now().Add(pingTimeMax)) + conn, err := tcp.AcceptTCP() + if err != nil { + t.Fatalf("failed to connect: %s", err) + } + defer conn.Close() + + _, _, _, err = m.readStream(conn) + if err != nil { + t.Fatalf("failed to read ping: %s", err) + } + + bogus := indirectPingReq{} + out, err := encode(indirectPingMsg, &bogus) + if err != nil { + t.Fatalf("failed to encode bogus msg: %s", err) + } + + err = m.rawSendMsgStream(conn, out.Bytes()) + if err != nil { + t.Fatalf("failed to send bogus msg: %s", err) + } + }() + deadline = time.Now().Add(pingTimeout) + didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) + if err == nil || !strings.Contains(err.Error(), "Unexpected msgType") { + t.Fatalf("expected an error from bogus message") + } + if didContact { + t.Fatalf("expected failed ping") + } + + // Make sure failed I/O respects the deadline. 
In this case we try the + // common case of the receiving node being totally down. + tcp.Close() + deadline = time.Now().Add(pingTimeout) + startPing := time.Now() + didContact, err = m.sendPingAndWaitForAck(tcpAddr.String(), pingOut, deadline) + pingTime := time.Now().Sub(startPing) + if err != nil { + t.Fatalf("expected no error during ping on closed socket, got: %s", err) + } + if didContact { + t.Fatalf("expected failed ping") + } + if pingTime > pingTimeMax { + t.Fatalf("took too long to fail ping, %9.6f", pingTime.Seconds()) + } +} + +func TestTCPPushPull(t *testing.T) { + m := GetMemberlist(t) + defer m.Shutdown() + m.nodes = append(m.nodes, &nodeState{ + Node: Node{ + Name: "Test 0", + Addr: net.ParseIP(m.config.BindAddr), + Port: uint16(m.config.BindPort), + }, + Incarnation: 0, + State: stateSuspect, + StateChange: time.Now().Add(-1 * time.Second), + }) + + addr := fmt.Sprintf("%s:%d", m.config.BindAddr, m.config.BindPort) + conn, err := net.Dial("tcp", addr) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + defer conn.Close() + + localNodes := make([]pushNodeState, 3) + localNodes[0].Name = "Test 0" + localNodes[0].Addr = net.ParseIP(m.config.BindAddr) + localNodes[0].Port = uint16(m.config.BindPort) + localNodes[0].Incarnation = 1 + localNodes[0].State = stateAlive + localNodes[1].Name = "Test 1" + localNodes[1].Addr = net.ParseIP(m.config.BindAddr) + localNodes[1].Port = uint16(m.config.BindPort) + localNodes[1].Incarnation = 1 + localNodes[1].State = stateAlive + localNodes[2].Name = "Test 2" + localNodes[2].Addr = net.ParseIP(m.config.BindAddr) + localNodes[2].Port = uint16(m.config.BindPort) + localNodes[2].Incarnation = 1 + localNodes[2].State = stateAlive + + // Send our node state + header := pushPullHeader{Nodes: 3} + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(conn, &hd) + + // Send the push/pull indicator + conn.Write([]byte{byte(pushPullMsg)}) + + if err := enc.Encode(&header); err != nil { + t.Fatalf("unexpected err %s", 
err) + } + for i := 0; i < header.Nodes; i++ { + if err := enc.Encode(&localNodes[i]); err != nil { + t.Fatalf("unexpected err %s", err) + } + } + + // Read the message type + var msgType messageType + if err := binary.Read(conn, binary.BigEndian, &msgType); err != nil { + t.Fatalf("unexpected err %s", err) + } + + var bufConn io.Reader = conn + msghd := codec.MsgpackHandle{} + dec := codec.NewDecoder(bufConn, &msghd) + + // Check if we have a compressed message + if msgType == compressMsg { + var c compress + if err := dec.Decode(&c); err != nil { + t.Fatalf("unexpected err %s", err) + } + decomp, err := decompressBuffer(&c) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Reset the message type + msgType = messageType(decomp[0]) + + // Create a new bufConn + bufConn = bytes.NewReader(decomp[1:]) + + // Create a new decoder + dec = codec.NewDecoder(bufConn, &hd) + } + + // Quit if not push/pull + if msgType != pushPullMsg { + t.Fatalf("bad message type") + } + + if err := dec.Decode(&header); err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Allocate space for the transfer + remoteNodes := make([]pushNodeState, header.Nodes) + + // Try to decode all the states + for i := 0; i < header.Nodes; i++ { + if err := dec.Decode(&remoteNodes[i]); err != nil { + t.Fatalf("unexpected err %s", err) + } + } + + if len(remoteNodes) != 1 { + t.Fatalf("bad response") + } + + n := &remoteNodes[0] + if n.Name != "Test 0" { + t.Fatalf("bad name") + } + if bytes.Compare(n.Addr, net.ParseIP(m.config.BindAddr)) != 0 { + t.Fatal("bad addr") + } + if n.Incarnation != 0 { + t.Fatal("bad incarnation") + } + if n.State != stateSuspect { + t.Fatal("bad state") + } +} + +func TestSendMsg_Piggyback(t *testing.T) { + m := GetMemberlist(t) + defer m.Shutdown() + + // Add a message to be broadcast + a := alive{ + Incarnation: 10, + Node: "rand", + Addr: []byte{127, 0, 0, 255}, + Meta: nil, + } + m.encodeAndBroadcast("rand", aliveMsg, &a) + + var udp *net.UDPConn + for 
port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + // Encode a ping + ping := ping{SeqNo: 42} + buf, err := encode(pingMsg, ping) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + // Send + addr := &net.UDPAddr{IP: net.ParseIP(m.config.BindAddr), Port: m.config.BindPort} + udp.WriteTo(buf.Bytes(), addr) + + // Wait for response + doneCh := make(chan struct{}, 1) + go func() { + select { + case <-doneCh: + case <-time.After(2 * time.Second): + panic("timeout") + } + }() + + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + msgType := messageType(in[0]) + if msgType != compoundMsg { + t.Fatalf("bad response %v", in) + } + + // get the parts + trunc, parts, err := decodeCompoundMessage(in[1:]) + if trunc != 0 { + t.Fatalf("unexpected truncation") + } + if len(parts) != 2 { + t.Fatalf("unexpected parts %v", parts) + } + if err != nil { + t.Fatalf("unexpected err %s", err) + } + + var ack ackResp + if err := decode(parts[0][1:], &ack); err != nil { + t.Fatalf("unexpected err %s", err) + } + + if ack.SeqNo != 42 { + t.Fatalf("bad sequence no") + } + + var aliveout alive + if err := decode(parts[1][1:], &aliveout); err != nil { + t.Fatalf("unexpected err %s", err) + } + + if aliveout.Node != "rand" || aliveout.Incarnation != 10 { + t.Fatalf("bad mesg") + } + + doneCh <- struct{}{} +} + +func TestEncryptDecryptState(t *testing.T) { + state := []byte("this is our internal state...") + config := &Config{ + SecretKey: []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + ProtocolVersion: ProtocolVersionMax, + } + + m, err := Create(config) + if err != nil { + t.Fatalf("err: %s", err) + } + defer m.Shutdown() + + crypt, err := m.encryptLocalState(state) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create reader, 
seek past the type byte + buf := bytes.NewReader(crypt) + buf.Seek(1, 0) + + plain, err := m.decryptRemoteState(buf) + if err != nil { + t.Fatalf("err: %v", err) + } + + if !reflect.DeepEqual(state, plain) { + t.Fatalf("Decrypt failed: %v", plain) + } +} + +func TestRawSendUdp_CRC(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Pass a nil node with no nodes registered, should result in no checksum + payload := []byte{3, 3, 3, 3} + m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) + + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + if len(in) != 4 { + t.Fatalf("bad: %v", in) + } + + // Pass a non-nil node with PMax >= 5, should result in a checksum + m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) + + in = make([]byte, 1500) + n, _, err = udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + if len(in) != 9 { + t.Fatalf("bad: %v", in) + } + + // Register a node with PMax >= 5 to be looked up, should result in a checksum + m.nodeMap["127.0.0.1"] = &nodeState{ + Node: Node{PMax: 5}, + } + m.rawSendMsgPacket(udp.LocalAddr().String(), nil, payload) + + in = make([]byte, 1500) + n, _, err = udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + if len(in) != 9 { + t.Fatalf("bad: %v", in) + } +} + +func TestIngestPacket_CRC(t *testing.T) { + m := GetMemberlist(t) + m.config.EnableCompression = false + defer m.Shutdown() + + var udp *net.UDPConn + for port := 60000; port < 61000; port++ { + udpAddr := fmt.Sprintf("127.0.0.1:%d", port) + udpLn, 
err := net.ListenPacket("udp", udpAddr) + if err == nil { + udp = udpLn.(*net.UDPConn) + break + } + } + + if udp == nil { + t.Fatalf("no udp listener") + } + + // Get a message with a checksum + payload := []byte{3, 3, 3, 3} + m.rawSendMsgPacket(udp.LocalAddr().String(), &Node{PMax: 5}, payload) + + in := make([]byte, 1500) + n, _, err := udp.ReadFrom(in) + if err != nil { + t.Fatalf("unexpected err %s", err) + } + in = in[0:n] + + if len(in) != 9 { + t.Fatalf("bad: %v", in) + } + + // Corrupt the checksum + in[1] <<= 1 + + logs := &bytes.Buffer{} + logger := log.New(logs, "", 0) + m.logger = logger + m.ingestPacket(in, udp.LocalAddr(), time.Now()) + + if !strings.Contains(logs.String(), "invalid checksum") { + t.Fatalf("bad: %s", logs.String()) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/net_transport.go b/vendor/github.com/hashicorp/memberlist/net_transport.go new file mode 100644 index 000000000..e7b88b01f --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/net_transport.go @@ -0,0 +1,289 @@ +package memberlist + +import ( + "fmt" + "log" + "net" + "sync" + "sync/atomic" + "time" + + "github.com/armon/go-metrics" + sockaddr "github.com/hashicorp/go-sockaddr" +) + +const ( + // udpPacketBufSize is used to buffer incoming packets during read + // operations. + udpPacketBufSize = 65536 + + // udpRecvBufSize is a large buffer size that we attempt to set UDP + // sockets to in order to handle a large volume of messages. + udpRecvBufSize = 2 * 1024 * 1024 +) + +// NetTransportConfig is used to configure a net transport. +type NetTransportConfig struct { + // BindAddrs is a list of addresses to bind to for both TCP and UDP + // communications. + BindAddrs []string + + // BindPort is the port to listen on, for each address above. + BindPort int + + // Logger is a logger for operator messages. 
+ Logger *log.Logger +} + +// NetTransport is a Transport implementation that uses connectionless UDP for +// packet operations, and ad-hoc TCP connections for stream operations. +type NetTransport struct { + config *NetTransportConfig + packetCh chan *Packet + streamCh chan net.Conn + logger *log.Logger + wg sync.WaitGroup + tcpListeners []*net.TCPListener + udpListeners []*net.UDPConn + shutdown int32 +} + +// NewNetTransport returns a net transport with the given configuration. On +// success all the network listeners will be created and listening. +func NewNetTransport(config *NetTransportConfig) (*NetTransport, error) { + // If we reject the empty list outright we can assume that there's at + // least one listener of each type later during operation. + if len(config.BindAddrs) == 0 { + return nil, fmt.Errorf("At least one bind address is required") + } + + // Build out the new transport. + var ok bool + t := NetTransport{ + config: config, + packetCh: make(chan *Packet), + streamCh: make(chan net.Conn), + logger: config.Logger, + } + + // Clean up listeners if there's an error. + defer func() { + if !ok { + t.Shutdown() + } + }() + + // Build all the TCP and UDP listeners. + port := config.BindPort + for _, addr := range config.BindAddrs { + ip := net.ParseIP(addr) + + tcpAddr := &net.TCPAddr{IP: ip, Port: port} + tcpLn, err := net.ListenTCP("tcp", tcpAddr) + if err != nil { + return nil, fmt.Errorf("Failed to start TCP listener on %q port %d: %v", addr, port, err) + } + t.tcpListeners = append(t.tcpListeners, tcpLn) + + // If the config port given was zero, use the first TCP listener + // to pick an available port and then apply that to everything + // else. 
+ if port == 0 { + port = tcpLn.Addr().(*net.TCPAddr).Port + } + + udpAddr := &net.UDPAddr{IP: ip, Port: port} + udpLn, err := net.ListenUDP("udp", udpAddr) + if err != nil { + return nil, fmt.Errorf("Failed to start UDP listener on %q port %d: %v", addr, port, err) + } + if err := setUDPRecvBuf(udpLn); err != nil { + return nil, fmt.Errorf("Failed to resize UDP buffer: %v", err) + } + t.udpListeners = append(t.udpListeners, udpLn) + } + + // Fire them up now that we've been able to create them all. + for i := 0; i < len(config.BindAddrs); i++ { + t.wg.Add(2) + go t.tcpListen(t.tcpListeners[i]) + go t.udpListen(t.udpListeners[i]) + } + + ok = true + return &t, nil +} + +// GetAutoBindPort returns the bind port that was automatically given by the +// kernel, if a bind port of 0 was given. +func (t *NetTransport) GetAutoBindPort() int { + // We made sure there's at least one TCP listener, and that one's + // port was applied to all the others for the dynamic bind case. + return t.tcpListeners[0].Addr().(*net.TCPAddr).Port +} + +// See Transport. +func (t *NetTransport) FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) { + var advertiseAddr net.IP + var advertisePort int + if ip != "" { + // If they've supplied an address, use that. + advertiseAddr = net.ParseIP(ip) + if advertiseAddr == nil { + return nil, 0, fmt.Errorf("Failed to parse advertise address %q", ip) + } + + // Ensure IPv4 conversion if necessary. + if ip4 := advertiseAddr.To4(); ip4 != nil { + advertiseAddr = ip4 + } + advertisePort = port + } else { + if t.config.BindAddrs[0] == "0.0.0.0" { + // Otherwise, if we're not bound to a specific IP, let's + // use a suitable private IP address. 
+ var err error + ip, err = sockaddr.GetPrivateIP() + if err != nil { + return nil, 0, fmt.Errorf("Failed to get interface addresses: %v", err) + } + if ip == "" { + return nil, 0, fmt.Errorf("No private IP address found, and explicit IP not provided") + } + + advertiseAddr = net.ParseIP(ip) + if advertiseAddr == nil { + return nil, 0, fmt.Errorf("Failed to parse advertise address: %q", ip) + } + } else { + // Use the IP that we're bound to, based on the first + // TCP listener, which we already ensure is there. + advertiseAddr = t.tcpListeners[0].Addr().(*net.TCPAddr).IP + } + + // Use the port we are bound to. + advertisePort = t.GetAutoBindPort() + } + + return advertiseAddr, advertisePort, nil +} + +// See Transport. +func (t *NetTransport) WriteTo(b []byte, addr string) (time.Time, error) { + udpAddr, err := net.ResolveUDPAddr("udp", addr) + if err != nil { + return time.Time{}, err + } + + // We made sure there's at least one UDP listener, so just use the + // packet sending interface on the first one. Take the time after the + // write call comes back, which will underestimate the time a little, + // but help account for any delays before the write occurs. + _, err = t.udpListeners[0].WriteTo(b, udpAddr) + return time.Now(), err +} + +// See Transport. +func (t *NetTransport) PacketCh() <-chan *Packet { + return t.packetCh +} + +// See Transport. +func (t *NetTransport) DialTimeout(addr string, timeout time.Duration) (net.Conn, error) { + dialer := net.Dialer{Timeout: timeout} + return dialer.Dial("tcp", addr) +} + +// See Transport. +func (t *NetTransport) StreamCh() <-chan net.Conn { + return t.streamCh +} + +// See Transport. +func (t *NetTransport) Shutdown() error { + // This will avoid log spam about errors when we shut down. + atomic.StoreInt32(&t.shutdown, 1) + + // Rip through all the connections and shut them down. 
+	for _, conn := range t.tcpListeners {
+		conn.Close()
+	}
+	for _, conn := range t.udpListeners {
+		conn.Close()
+	}
+
+	// Block until all the listener threads have died.
+	t.wg.Wait()
+	return nil
+}
+
+// tcpListen is a long running goroutine that accepts incoming TCP connections
+// and hands them off to the stream channel.
+func (t *NetTransport) tcpListen(tcpLn *net.TCPListener) {
+	defer t.wg.Done()
+	for {
+		conn, err := tcpLn.AcceptTCP()
+		if err != nil {
+			if s := atomic.LoadInt32(&t.shutdown); s == 1 {
+				break
+			}
+
+			t.logger.Printf("[ERR] memberlist: Error accepting TCP connection: %v", err)
+			continue
+		}
+
+		t.streamCh <- conn
+	}
+}
+
+// udpListen is a long running goroutine that accepts incoming UDP packets and
+// hands them off to the packet channel.
+func (t *NetTransport) udpListen(udpLn *net.UDPConn) {
+	defer t.wg.Done()
+	for {
+		// Do a blocking read into a fresh buffer. Grab a time stamp as
+		// close as possible to the I/O.
+		buf := make([]byte, udpPacketBufSize)
+		n, addr, err := udpLn.ReadFrom(buf)
+		ts := time.Now()
+		if err != nil {
+			if s := atomic.LoadInt32(&t.shutdown); s == 1 {
+				break
+			}
+
+			t.logger.Printf("[ERR] memberlist: Error reading UDP packet: %v", err)
+			continue
+		}
+
+		// Check the length - it needs to have at least one byte to be a
+		// proper message. (Log n, the bytes read, not the buffer size.)
+		if n < 1 {
+			t.logger.Printf("[ERR] memberlist: UDP packet too short (%d bytes) %s",
+				n, LogAddress(addr))
+			continue
+		}
+
+		// Ingest the packet.
+		metrics.IncrCounter([]string{"memberlist", "udp", "received"}, float32(n))
+		t.packetCh <- &Packet{
+			Buf:       buf[:n],
+			From:      addr,
+			Timestamp: ts,
+		}
+	}
+}
+
+// setUDPRecvBuf is used to resize the UDP receive window. The function
+// attempts to set the read buffer to `udpRecvBuf` but backs off until
+// the read buffer can be set.
+func setUDPRecvBuf(c *net.UDPConn) error { + size := udpRecvBufSize + var err error + for size > 0 { + if err = c.SetReadBuffer(size); err == nil { + return nil + } + size = size / 2 + } + return err +} diff --git a/vendor/github.com/hashicorp/memberlist/ping_delegate.go b/vendor/github.com/hashicorp/memberlist/ping_delegate.go new file mode 100644 index 000000000..1566c8b3d --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/ping_delegate.go @@ -0,0 +1,14 @@ +package memberlist + +import "time" + +// PingDelegate is used to notify an observer how long it took for a ping message to +// complete a round trip. It can also be used for writing arbitrary byte slices +// into ack messages. Note that in order to be meaningful for RTT estimates, this +// delegate does not apply to indirect pings, nor fallback pings sent over TCP. +type PingDelegate interface { + // AckPayload is invoked when an ack is being sent; the returned bytes will be appended to the ack + AckPayload() []byte + // NotifyPing is invoked when an ack for a ping is received + NotifyPingComplete(other *Node, rtt time.Duration, payload []byte) +} diff --git a/vendor/github.com/hashicorp/memberlist/queue.go b/vendor/github.com/hashicorp/memberlist/queue.go new file mode 100644 index 000000000..994b90ff1 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/queue.go @@ -0,0 +1,167 @@ +package memberlist + +import ( + "sort" + "sync" +) + +// TransmitLimitedQueue is used to queue messages to broadcast to +// the cluster (via gossip) but limits the number of transmits per +// message. It also prioritizes messages with lower transmit counts +// (hence newer messages). +type TransmitLimitedQueue struct { + // NumNodes returns the number of nodes in the cluster. This is + // used to determine the retransmit count, which is calculated + // based on the log of this. + NumNodes func() int + + // RetransmitMult is the multiplier used to determine the maximum + // number of retransmissions attempted. 
+ RetransmitMult int + + sync.Mutex + bcQueue limitedBroadcasts +} + +type limitedBroadcast struct { + transmits int // Number of transmissions attempted. + b Broadcast +} +type limitedBroadcasts []*limitedBroadcast + +// Broadcast is something that can be broadcasted via gossip to +// the memberlist cluster. +type Broadcast interface { + // Invalidates checks if enqueuing the current broadcast + // invalidates a previous broadcast + Invalidates(b Broadcast) bool + + // Returns a byte form of the message + Message() []byte + + // Finished is invoked when the message will no longer + // be broadcast, either due to invalidation or to the + // transmit limit being reached + Finished() +} + +// QueueBroadcast is used to enqueue a broadcast +func (q *TransmitLimitedQueue) QueueBroadcast(b Broadcast) { + q.Lock() + defer q.Unlock() + + // Check if this message invalidates another + n := len(q.bcQueue) + for i := 0; i < n; i++ { + if b.Invalidates(q.bcQueue[i].b) { + q.bcQueue[i].b.Finished() + copy(q.bcQueue[i:], q.bcQueue[i+1:]) + q.bcQueue[n-1] = nil + q.bcQueue = q.bcQueue[:n-1] + n-- + } + } + + // Append to the queue + q.bcQueue = append(q.bcQueue, &limitedBroadcast{0, b}) +} + +// GetBroadcasts is used to get a number of broadcasts, up to a byte limit +// and applying a per-message overhead as provided. 
+func (q *TransmitLimitedQueue) GetBroadcasts(overhead, limit int) [][]byte { + q.Lock() + defer q.Unlock() + + // Fast path the default case + if len(q.bcQueue) == 0 { + return nil + } + + transmitLimit := retransmitLimit(q.RetransmitMult, q.NumNodes()) + bytesUsed := 0 + var toSend [][]byte + + for i := len(q.bcQueue) - 1; i >= 0; i-- { + // Check if this is within our limits + b := q.bcQueue[i] + msg := b.b.Message() + if bytesUsed+overhead+len(msg) > limit { + continue + } + + // Add to slice to send + bytesUsed += overhead + len(msg) + toSend = append(toSend, msg) + + // Check if we should stop transmission + b.transmits++ + if b.transmits >= transmitLimit { + b.b.Finished() + n := len(q.bcQueue) + q.bcQueue[i], q.bcQueue[n-1] = q.bcQueue[n-1], nil + q.bcQueue = q.bcQueue[:n-1] + } + } + + // If we are sending anything, we need to re-sort to deal + // with adjusted transmit counts + if len(toSend) > 0 { + q.bcQueue.Sort() + } + return toSend +} + +// NumQueued returns the number of queued messages +func (q *TransmitLimitedQueue) NumQueued() int { + q.Lock() + defer q.Unlock() + return len(q.bcQueue) +} + +// Reset clears all the queued messages +func (q *TransmitLimitedQueue) Reset() { + q.Lock() + defer q.Unlock() + for _, b := range q.bcQueue { + b.b.Finished() + } + q.bcQueue = nil +} + +// Prune will retain the maxRetain latest messages, and the rest +// will be discarded. 
This can be used to prevent unbounded queue sizes +func (q *TransmitLimitedQueue) Prune(maxRetain int) { + q.Lock() + defer q.Unlock() + + // Do nothing if queue size is less than the limit + n := len(q.bcQueue) + if n < maxRetain { + return + } + + // Invalidate the messages we will be removing + for i := 0; i < n-maxRetain; i++ { + q.bcQueue[i].b.Finished() + } + + // Move the messages, and retain only the last maxRetain + copy(q.bcQueue[0:], q.bcQueue[n-maxRetain:]) + q.bcQueue = q.bcQueue[:maxRetain] +} + +func (b limitedBroadcasts) Len() int { + return len(b) +} + +func (b limitedBroadcasts) Less(i, j int) bool { + return b[i].transmits < b[j].transmits +} + +func (b limitedBroadcasts) Swap(i, j int) { + b[i], b[j] = b[j], b[i] +} + +func (b limitedBroadcasts) Sort() { + sort.Sort(sort.Reverse(b)) +} diff --git a/vendor/github.com/hashicorp/memberlist/queue_test.go b/vendor/github.com/hashicorp/memberlist/queue_test.go new file mode 100644 index 000000000..765a3b53d --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/queue_test.go @@ -0,0 +1,172 @@ +package memberlist + +import ( + "testing" +) + +func TestTransmitLimited_Queue(t *testing.T) { + q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 1 }} + q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) + q.QueueBroadcast(&memberlistBroadcast{"foo", nil, nil}) + q.QueueBroadcast(&memberlistBroadcast{"bar", nil, nil}) + + if len(q.bcQueue) != 3 { + t.Fatalf("bad len") + } + if q.bcQueue[0].b.(*memberlistBroadcast).node != "test" { + t.Fatalf("missing test") + } + if q.bcQueue[1].b.(*memberlistBroadcast).node != "foo" { + t.Fatalf("missing foo") + } + if q.bcQueue[2].b.(*memberlistBroadcast).node != "bar" { + t.Fatalf("missing bar") + } + + // Should invalidate previous message + q.QueueBroadcast(&memberlistBroadcast{"test", nil, nil}) + + if len(q.bcQueue) != 3 { + t.Fatalf("bad len") + } + if q.bcQueue[0].b.(*memberlistBroadcast).node != "foo" { + t.Fatalf("missing foo") + 
} + if q.bcQueue[1].b.(*memberlistBroadcast).node != "bar" { + t.Fatalf("missing bar") + } + if q.bcQueue[2].b.(*memberlistBroadcast).node != "test" { + t.Fatalf("missing test") + } +} + +func TestTransmitLimited_GetBroadcasts(t *testing.T) { + q := &TransmitLimitedQueue{RetransmitMult: 3, NumNodes: func() int { return 10 }} + + // 18 bytes per message + q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. this is a test."), nil}) + + // 2 byte overhead per message, should get all 4 messages + all := q.GetBroadcasts(2, 80) + if len(all) != 4 { + t.Fatalf("missing messages: %v", all) + } + + // 3 byte overhead, should only get 3 messages back + partial := q.GetBroadcasts(3, 80) + if len(partial) != 3 { + t.Fatalf("missing messages: %v", partial) + } +} + +func TestTransmitLimited_GetBroadcasts_Limit(t *testing.T) { + q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} + + // 18 bytes per message + q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. 
this is a test."), nil}) + + // 3 byte overhead, should only get 3 messages back + partial1 := q.GetBroadcasts(3, 80) + if len(partial1) != 3 { + t.Fatalf("missing messages: %v", partial1) + } + + partial2 := q.GetBroadcasts(3, 80) + if len(partial2) != 3 { + t.Fatalf("missing messages: %v", partial2) + } + + // Only two not expired + partial3 := q.GetBroadcasts(3, 80) + if len(partial3) != 2 { + t.Fatalf("missing messages: %v", partial3) + } + + // Should get nothing + partial5 := q.GetBroadcasts(3, 80) + if len(partial5) != 0 { + t.Fatalf("missing messages: %v", partial5) + } +} + +func TestTransmitLimited_Prune(t *testing.T) { + q := &TransmitLimitedQueue{RetransmitMult: 1, NumNodes: func() int { return 10 }} + + ch1 := make(chan struct{}, 1) + ch2 := make(chan struct{}, 1) + + // 18 bytes per message + q.QueueBroadcast(&memberlistBroadcast{"test", []byte("1. this is a test."), ch1}) + q.QueueBroadcast(&memberlistBroadcast{"foo", []byte("2. this is a test."), ch2}) + q.QueueBroadcast(&memberlistBroadcast{"bar", []byte("3. this is a test."), nil}) + q.QueueBroadcast(&memberlistBroadcast{"baz", []byte("4. 
this is a test."), nil})
+
+	// Keep only 2
+	q.Prune(2)
+
+	if q.NumQueued() != 2 {
+		t.Fatalf("bad len")
+	}
+
+	// Should notify the first two
+	select {
+	case <-ch1:
+	default:
+		t.Fatalf("expected invalidation")
+	}
+	select {
+	case <-ch2:
+	default:
+		t.Fatalf("expected invalidation")
+	}
+
+	if q.bcQueue[0].b.(*memberlistBroadcast).node != "bar" {
+		t.Fatalf("missing bar")
+	}
+	if q.bcQueue[1].b.(*memberlistBroadcast).node != "baz" {
+		t.Fatalf("missing baz")
+	}
+}
+
+func TestLimitedBroadcastSort(t *testing.T) {
+	bc := limitedBroadcasts([]*limitedBroadcast{
+		&limitedBroadcast{
+			transmits: 0,
+		},
+		&limitedBroadcast{
+			transmits: 10,
+		},
+		&limitedBroadcast{
+			transmits: 3,
+		},
+		&limitedBroadcast{
+			transmits: 4,
+		},
+		&limitedBroadcast{
+			transmits: 7,
+		},
+	})
+	bc.Sort()
+
+	if bc[0].transmits != 10 {
+		t.Fatalf("bad val %v", bc[0])
+	}
+	if bc[1].transmits != 7 {
+		t.Fatalf("bad val %v", bc[1])
+	}
+	if bc[2].transmits != 4 {
+		t.Fatalf("bad val %v", bc[2])
+	}
+	if bc[3].transmits != 3 {
+		t.Fatalf("bad val %v", bc[3])
+	}
+	if bc[4].transmits != 0 {
+		t.Fatalf("bad val %v", bc[4])
+	}
+}
diff --git a/vendor/github.com/hashicorp/memberlist/security.go b/vendor/github.com/hashicorp/memberlist/security.go
new file mode 100644
index 000000000..d90114eb0
--- /dev/null
+++ b/vendor/github.com/hashicorp/memberlist/security.go
@@ -0,0 +1,198 @@
+package memberlist
+
+import (
+	"bytes"
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/rand"
+	"fmt"
+	"io"
+)
+
+/*
+
+Encrypted messages are prefixed with an encryptionVersion byte
+that is used for us to be able to properly encode/decode. We
+currently support the following versions:
+
+ 0 - AES-GCM 128, using PKCS7 padding
+ 1 - AES-GCM 128, no padding. Padding not needed, caused bloat.
+ +*/ +type encryptionVersion uint8 + +const ( + minEncryptionVersion encryptionVersion = 0 + maxEncryptionVersion encryptionVersion = 1 +) + +const ( + versionSize = 1 + nonceSize = 12 + tagSize = 16 + maxPadOverhead = 16 + blockSize = aes.BlockSize +) + +// pkcs7encode is used to pad a byte buffer to a specific block size using +// the PKCS7 algorithm. "Ignores" some bytes to compensate for IV +func pkcs7encode(buf *bytes.Buffer, ignore, blockSize int) { + n := buf.Len() - ignore + more := blockSize - (n % blockSize) + for i := 0; i < more; i++ { + buf.WriteByte(byte(more)) + } +} + +// pkcs7decode is used to decode a buffer that has been padded +func pkcs7decode(buf []byte, blockSize int) []byte { + if len(buf) == 0 { + panic("Cannot decode a PKCS7 buffer of zero length") + } + n := len(buf) + last := buf[n-1] + n -= int(last) + return buf[:n] +} + +// encryptOverhead returns the maximum possible overhead of encryption by version +func encryptOverhead(vsn encryptionVersion) int { + switch vsn { + case 0: + return 45 // Version: 1, IV: 12, Padding: 16, Tag: 16 + case 1: + return 29 // Version: 1, IV: 12, Tag: 16 + default: + panic("unsupported version") + } +} + +// encryptedLength is used to compute the buffer size needed +// for a message of given length +func encryptedLength(vsn encryptionVersion, inp int) int { + // If we are on version 1, there is no padding + if vsn >= 1 { + return versionSize + nonceSize + inp + tagSize + } + + // Determine the padding size + padding := blockSize - (inp % blockSize) + + // Sum the extra parts to get total size + return versionSize + nonceSize + inp + padding + tagSize +} + +// encryptPayload is used to encrypt a message with a given key. +// We make use of AES-128 in GCM mode. 
New byte buffer is the version, +// nonce, ciphertext and tag +func encryptPayload(vsn encryptionVersion, key []byte, msg []byte, data []byte, dst *bytes.Buffer) error { + // Get the AES block cipher + aesBlock, err := aes.NewCipher(key) + if err != nil { + return err + } + + // Get the GCM cipher mode + gcm, err := cipher.NewGCM(aesBlock) + if err != nil { + return err + } + + // Grow the buffer to make room for everything + offset := dst.Len() + dst.Grow(encryptedLength(vsn, len(msg))) + + // Write the encryption version + dst.WriteByte(byte(vsn)) + + // Add a random nonce + io.CopyN(dst, rand.Reader, nonceSize) + afterNonce := dst.Len() + + // Ensure we are correctly padded (only version 0) + if vsn == 0 { + io.Copy(dst, bytes.NewReader(msg)) + pkcs7encode(dst, offset+versionSize+nonceSize, aes.BlockSize) + } + + // Encrypt message using GCM + slice := dst.Bytes()[offset:] + nonce := slice[versionSize : versionSize+nonceSize] + + // Message source depends on the encryption version. + // Version 0 uses padding, version 1 does not + var src []byte + if vsn == 0 { + src = slice[versionSize+nonceSize:] + } else { + src = msg + } + out := gcm.Seal(nil, nonce, src, data) + + // Truncate the plaintext, and write the cipher text + dst.Truncate(afterNonce) + dst.Write(out) + return nil +} + +// decryptMessage performs the actual decryption of ciphertext. This is in its +// own function to allow it to be called on all keys easily. +func decryptMessage(key, msg []byte, data []byte) ([]byte, error) { + // Get the AES block cipher + aesBlock, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + + // Get the GCM cipher mode + gcm, err := cipher.NewGCM(aesBlock) + if err != nil { + return nil, err + } + + // Decrypt the message + nonce := msg[versionSize : versionSize+nonceSize] + ciphertext := msg[versionSize+nonceSize:] + plain, err := gcm.Open(nil, nonce, ciphertext, data) + if err != nil { + return nil, err + } + + // Success! 
+ return plain, nil +} + +// decryptPayload is used to decrypt a message with a given key, +// and verify it's contents. Any padding will be removed, and a +// slice to the plaintext is returned. Decryption is done IN PLACE! +func decryptPayload(keys [][]byte, msg []byte, data []byte) ([]byte, error) { + // Ensure we have at least one byte + if len(msg) == 0 { + return nil, fmt.Errorf("Cannot decrypt empty payload") + } + + // Verify the version + vsn := encryptionVersion(msg[0]) + if vsn > maxEncryptionVersion { + return nil, fmt.Errorf("Unsupported encryption version %d", msg[0]) + } + + // Ensure the length is sane + if len(msg) < encryptedLength(vsn, 0) { + return nil, fmt.Errorf("Payload is too small to decrypt: %d", len(msg)) + } + + for _, key := range keys { + plain, err := decryptMessage(key, msg, data) + if err == nil { + // Remove the PKCS7 padding for vsn 0 + if vsn == 0 { + return pkcs7decode(plain, aes.BlockSize), nil + } else { + return plain, nil + } + } + } + + return nil, fmt.Errorf("No installed keys could decrypt the message") +} diff --git a/vendor/github.com/hashicorp/memberlist/security_test.go b/vendor/github.com/hashicorp/memberlist/security_test.go new file mode 100644 index 000000000..15fa4aa8e --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/security_test.go @@ -0,0 +1,70 @@ +package memberlist + +import ( + "bytes" + "reflect" + "testing" +) + +func TestPKCS7(t *testing.T) { + for i := 0; i <= 255; i++ { + // Make a buffer of size i + buf := []byte{} + for j := 0; j < i; j++ { + buf = append(buf, byte(i)) + } + + // Copy to bytes buffer + inp := bytes.NewBuffer(nil) + inp.Write(buf) + + // Pad this out + pkcs7encode(inp, 0, 16) + + // Unpad + dec := pkcs7decode(inp.Bytes(), 16) + + // Ensure equivilence + if !reflect.DeepEqual(buf, dec) { + t.Fatalf("mismatch: %v %v", buf, dec) + } + } + +} + +func TestEncryptDecrypt_V0(t *testing.T) { + encryptDecryptVersioned(0, t) +} + +func TestEncryptDecrypt_V1(t *testing.T) { + 
encryptDecryptVersioned(1, t) +} + +func encryptDecryptVersioned(vsn encryptionVersion, t *testing.T) { + k1 := []byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15} + plaintext := []byte("this is a plain text message") + extra := []byte("random data") + + var buf bytes.Buffer + err := encryptPayload(vsn, k1, plaintext, extra, &buf) + if err != nil { + t.Fatalf("err: %v", err) + } + + expLen := encryptedLength(vsn, len(plaintext)) + if buf.Len() != expLen { + t.Fatalf("output length is unexpected %d %d %d", len(plaintext), buf.Len(), expLen) + } + + msg, err := decryptPayload([][]byte{k1}, buf.Bytes(), extra) + if err != nil { + t.Fatalf("err: %v", err) + } + + cmp := bytes.Compare(msg, plaintext) + if cmp != 0 { + t.Errorf("len %d %v", len(msg), msg) + t.Errorf("len %d %v", len(plaintext), plaintext) + t.Fatalf("encrypt/decrypt failed! %d '%s' '%s'", cmp, msg, plaintext) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/state.go b/vendor/github.com/hashicorp/memberlist/state.go new file mode 100644 index 000000000..8513361b1 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/state.go @@ -0,0 +1,1165 @@ +package memberlist + +import ( + "bytes" + "fmt" + "math" + "math/rand" + "net" + "sync/atomic" + "time" + + "github.com/armon/go-metrics" +) + +type nodeStateType int + +const ( + stateAlive nodeStateType = iota + stateSuspect + stateDead +) + +// Node represents a node in the cluster. +type Node struct { + Name string + Addr net.IP + Port uint16 + Meta []byte // Metadata from the delegate for this node. 
+ PMin uint8 // Minimum protocol version this understands + PMax uint8 // Maximum protocol version this understands + PCur uint8 // Current version node is speaking + DMin uint8 // Min protocol version for the delegate to understand + DMax uint8 // Max protocol version for the delegate to understand + DCur uint8 // Current version delegate is speaking +} + +// Address returns the host:port form of a node's address, suitable for use +// with a transport. +func (n *Node) Address() string { + return joinHostPort(n.Addr.String(), n.Port) +} + +// String returns the node name +func (n *Node) String() string { + return n.Name +} + +// NodeState is used to manage our state view of another node +type nodeState struct { + Node + Incarnation uint32 // Last known incarnation number + State nodeStateType // Current state + StateChange time.Time // Time last state change happened +} + +// Address returns the host:port form of a node's address, suitable for use +// with a transport. +func (n *nodeState) Address() string { + return n.Node.Address() +} + +// ackHandler is used to register handlers for incoming acks and nacks. +type ackHandler struct { + ackFn func([]byte, time.Time) + nackFn func() + timer *time.Timer +} + +// NoPingResponseError is used to indicate a 'ping' packet was +// successfully issued but no response was received +type NoPingResponseError struct { + node string +} + +func (f NoPingResponseError) Error() string { + return fmt.Sprintf("No response from node %s", f.node) +} + +// Schedule is used to ensure the Tick is performed periodically. This +// function is safe to call multiple times. If the memberlist is already +// scheduled, then it won't do anything. +func (m *Memberlist) schedule() { + m.tickerLock.Lock() + defer m.tickerLock.Unlock() + + // If we already have tickers, then don't do anything, since we're + // scheduled + if len(m.tickers) > 0 { + return + } + + // Create the stop tick channel, a blocking channel. 
We close this + // when we should stop the tickers. + stopCh := make(chan struct{}) + + // Create a new probeTicker + if m.config.ProbeInterval > 0 { + t := time.NewTicker(m.config.ProbeInterval) + go m.triggerFunc(m.config.ProbeInterval, t.C, stopCh, m.probe) + m.tickers = append(m.tickers, t) + } + + // Create a push pull ticker if needed + if m.config.PushPullInterval > 0 { + go m.pushPullTrigger(stopCh) + } + + // Create a gossip ticker if needed + if m.config.GossipInterval > 0 && m.config.GossipNodes > 0 { + t := time.NewTicker(m.config.GossipInterval) + go m.triggerFunc(m.config.GossipInterval, t.C, stopCh, m.gossip) + m.tickers = append(m.tickers, t) + } + + // If we made any tickers, then record the stopTick channel for + // later. + if len(m.tickers) > 0 { + m.stopTick = stopCh + } +} + +// triggerFunc is used to trigger a function call each time a +// message is received until a stop tick arrives. +func (m *Memberlist) triggerFunc(stagger time.Duration, C <-chan time.Time, stop <-chan struct{}, f func()) { + // Use a random stagger to avoid syncronizing + randStagger := time.Duration(uint64(rand.Int63()) % uint64(stagger)) + select { + case <-time.After(randStagger): + case <-stop: + return + } + for { + select { + case <-C: + f() + case <-stop: + return + } + } +} + +// pushPullTrigger is used to periodically trigger a push/pull until +// a stop tick arrives. 
We don't use triggerFunc since the push/pull +// timer is dynamically scaled based on cluster size to avoid network +// saturation +func (m *Memberlist) pushPullTrigger(stop <-chan struct{}) { + interval := m.config.PushPullInterval + + // Use a random stagger to avoid syncronizing + randStagger := time.Duration(uint64(rand.Int63()) % uint64(interval)) + select { + case <-time.After(randStagger): + case <-stop: + return + } + + // Tick using a dynamic timer + for { + tickTime := pushPullScale(interval, m.estNumNodes()) + select { + case <-time.After(tickTime): + m.pushPull() + case <-stop: + return + } + } +} + +// Deschedule is used to stop the background maintenance. This is safe +// to call multiple times. +func (m *Memberlist) deschedule() { + m.tickerLock.Lock() + defer m.tickerLock.Unlock() + + // If we have no tickers, then we aren't scheduled. + if len(m.tickers) == 0 { + return + } + + // Close the stop channel so all the ticker listeners stop. + close(m.stopTick) + + // Explicitly stop all the tickers themselves so they don't take + // up any more resources, and get rid of the list. 
+ for _, t := range m.tickers { + t.Stop() + } + m.tickers = nil +} + +// Tick is used to perform a single round of failure detection and gossip +func (m *Memberlist) probe() { + // Track the number of indexes we've considered probing + numCheck := 0 +START: + m.nodeLock.RLock() + + // Make sure we don't wrap around infinitely + if numCheck >= len(m.nodes) { + m.nodeLock.RUnlock() + return + } + + // Handle the wrap around case + if m.probeIndex >= len(m.nodes) { + m.nodeLock.RUnlock() + m.resetNodes() + m.probeIndex = 0 + numCheck++ + goto START + } + + // Determine if we should probe this node + skip := false + var node nodeState + + node = *m.nodes[m.probeIndex] + if node.Name == m.config.Name { + skip = true + } else if node.State == stateDead { + skip = true + } + + // Potentially skip + m.nodeLock.RUnlock() + m.probeIndex++ + if skip { + numCheck++ + goto START + } + + // Probe the specific node + m.probeNode(&node) +} + +// probeNode handles a single round of failure checking on a node. +func (m *Memberlist) probeNode(node *nodeState) { + defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now()) + + // We use our health awareness to scale the overall probe interval, so we + // slow down if we detect problems. The ticker that calls us can handle + // us running over the base interval, and will skip missed ticks. + probeInterval := m.awareness.ScaleTimeout(m.config.ProbeInterval) + if probeInterval > m.config.ProbeInterval { + metrics.IncrCounter([]string{"memberlist", "degraded", "probe"}, 1) + } + + // Prepare a ping message and setup an ack handler. + ping := ping{SeqNo: m.nextSeqNo(), Node: node.Name} + ackCh := make(chan ackMessage, m.config.IndirectChecks+1) + nackCh := make(chan struct{}, m.config.IndirectChecks+1) + m.setProbeChannels(ping.SeqNo, ackCh, nackCh, probeInterval) + + // Send a ping to the node. 
If this node looks like it's suspect or dead, + // also tack on a suspect message so that it has a chance to refute as + // soon as possible. + deadline := time.Now().Add(probeInterval) + addr := node.Address() + if node.State == stateAlive { + if err := m.encodeAndSendMsg(addr, pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err) + return + } + } else { + var msgs [][]byte + if buf, err := encode(pingMsg, &ping); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode ping message: %s", err) + return + } else { + msgs = append(msgs, buf.Bytes()) + } + s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} + if buf, err := encode(suspectMsg, &s); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to encode suspect message: %s", err) + return + } else { + msgs = append(msgs, buf.Bytes()) + } + + compound := makeCompoundMessage(msgs) + if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", addr, err) + return + } + } + + // Mark the sent time here, which should be after any pre-processing and + // system calls to do the actual send. This probably under-reports a bit, + // but it's the best we can do. + sent := time.Now() + + // Arrange for our self-awareness to get updated. At this point we've + // sent the ping, so any return statement means the probe succeeded + // which will improve our health until we get to the failure scenarios + // at the end of this function, which will alter this delta variable + // accordingly. + awarenessDelta := -1 + defer func() { + m.awareness.ApplyDelta(awarenessDelta) + }() + + // Wait for response or round-trip-time. 
+ select { + case v := <-ackCh: + if v.Complete == true { + if m.config.Ping != nil { + rtt := v.Timestamp.Sub(sent) + m.config.Ping.NotifyPingComplete(&node.Node, rtt, v.Payload) + } + return + } + + // As an edge case, if we get a timeout, we need to re-enqueue it + // here to break out of the select below. + if v.Complete == false { + ackCh <- v + } + case <-time.After(m.config.ProbeTimeout): + // Note that we don't scale this timeout based on awareness and + // the health score. That's because we don't really expect waiting + // longer to help get UDP through. Since health does extend the + // probe interval it will give the TCP fallback more time, which + // is more active in dealing with lost packets, and it gives more + // time to wait for indirect acks/nacks. + m.logger.Printf("[DEBUG] memberlist: Failed ping: %v (timeout reached)", node.Name) + } + + // Get some random live nodes. + m.nodeLock.RLock() + kNodes := kRandomNodes(m.config.IndirectChecks, m.nodes, func(n *nodeState) bool { + return n.Name == m.config.Name || + n.Name == node.Name || + n.State != stateAlive + }) + m.nodeLock.RUnlock() + + // Attempt an indirect ping. + expectedNacks := 0 + ind := indirectPingReq{SeqNo: ping.SeqNo, Target: node.Addr, Port: node.Port, Node: node.Name} + for _, peer := range kNodes { + // We only expect nack to be sent from peers who understand + // version 4 of the protocol. + if ind.Nack = peer.PMax >= 4; ind.Nack { + expectedNacks++ + } + + if err := m.encodeAndSendMsg(peer.Address(), indirectPingMsg, &ind); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send indirect ping: %s", err) + } + } + + // Also make an attempt to contact the node directly over TCP. This + // helps prevent confused clients who get isolated from UDP traffic + // but can still speak TCP (which also means they can possibly report + // misinformation to other nodes via anti-entropy), avoiding flapping in + // the cluster. 
+ // + // This is a little unusual because we will attempt a TCP ping to any + // member who understands version 3 of the protocol, regardless of + // which protocol version we are speaking. That's why we've included a + // config option to turn this off if desired. + fallbackCh := make(chan bool, 1) + if (!m.config.DisableTcpPings) && (node.PMax >= 3) { + go func() { + defer close(fallbackCh) + didContact, err := m.sendPingAndWaitForAck(node.Address(), ping, deadline) + if err != nil { + m.logger.Printf("[ERR] memberlist: Failed fallback ping: %s", err) + } else { + fallbackCh <- didContact + } + }() + } else { + close(fallbackCh) + } + + // Wait for the acks or timeout. Note that we don't check the fallback + // channel here because we want to issue a warning below if that's the + // *only* way we hear back from the peer, so we have to let this time + // out first to allow the normal UDP-based acks to come in. + select { + case v := <-ackCh: + if v.Complete == true { + return + } + } + + // Finally, poll the fallback channel. The timeouts are set such that + // the channel will have something or be closed without having to wait + // any additional time here. + for didContact := range fallbackCh { + if didContact { + m.logger.Printf("[WARN] memberlist: Was able to connect to %s but other probes failed, network may be misconfigured", node.Name) + return + } + } + + // Update our self-awareness based on the results of this failed probe. + // If we don't have peers who will send nacks then we penalize for any + // failed probe as a simple health metric. If we do have peers to nack + // verify, then we can use that as a more sophisticated measure of self- + // health because we assume them to be working, and they can help us + // decide if the probed node was really dead or if it was something wrong + // with ourselves. 
+ awarenessDelta = 0 + if expectedNacks > 0 { + if nackCount := len(nackCh); nackCount < expectedNacks { + awarenessDelta += (expectedNacks - nackCount) + } + } else { + awarenessDelta += 1 + } + + // No acks received from target, suspect it as failed. + m.logger.Printf("[INFO] memberlist: Suspect %s has failed, no acks received", node.Name) + s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name} + m.suspectNode(&s) +} + +// Ping initiates a ping to the node with the specified name. +func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) { + // Prepare a ping message and setup an ack handler. + ping := ping{SeqNo: m.nextSeqNo(), Node: node} + ackCh := make(chan ackMessage, m.config.IndirectChecks+1) + m.setProbeChannels(ping.SeqNo, ackCh, nil, m.config.ProbeInterval) + + // Send a ping to the node. + if err := m.encodeAndSendMsg(addr.String(), pingMsg, &ping); err != nil { + return 0, err + } + + // Mark the sent time here, which should be after any pre-processing and + // system calls to do the actual send. This probably under-reports a bit, + // but it's the best we can do. + sent := time.Now() + + // Wait for response or timeout. + select { + case v := <-ackCh: + if v.Complete == true { + return v.Timestamp.Sub(sent), nil + } + case <-time.After(m.config.ProbeTimeout): + // Timeout, return an error below. + } + + m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node) + return 0, NoPingResponseError{ping.Node} +} + +// resetNodes is used when the tick wraps around. It will reap the +// dead nodes and shuffle the node list. 
+func (m *Memberlist) resetNodes() { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + + // Move dead nodes, but respect gossip to the dead interval + deadIdx := moveDeadNodes(m.nodes, m.config.GossipToTheDeadTime) + + // Deregister the dead nodes + for i := deadIdx; i < len(m.nodes); i++ { + delete(m.nodeMap, m.nodes[i].Name) + m.nodes[i] = nil + } + + // Trim the nodes to exclude the dead nodes + m.nodes = m.nodes[0:deadIdx] + + // Update numNodes after we've trimmed the dead nodes + atomic.StoreUint32(&m.numNodes, uint32(deadIdx)) + + // Shuffle live nodes + shuffleNodes(m.nodes) +} + +// gossip is invoked every GossipInterval period to broadcast our gossip +// messages to a few random nodes. +func (m *Memberlist) gossip() { + defer metrics.MeasureSince([]string{"memberlist", "gossip"}, time.Now()) + + // Get some random live, suspect, or recently dead nodes + m.nodeLock.RLock() + kNodes := kRandomNodes(m.config.GossipNodes, m.nodes, func(n *nodeState) bool { + if n.Name == m.config.Name { + return true + } + + switch n.State { + case stateAlive, stateSuspect: + return false + + case stateDead: + return time.Since(n.StateChange) > m.config.GossipToTheDeadTime + + default: + return true + } + }) + m.nodeLock.RUnlock() + + // Compute the bytes available + bytesAvail := m.config.UDPBufferSize - compoundHeaderOverhead + if m.config.EncryptionEnabled() { + bytesAvail -= encryptOverhead(m.encryptionVersion()) + } + + for _, node := range kNodes { + // Get any pending broadcasts + msgs := m.getBroadcasts(compoundOverhead, bytesAvail) + if len(msgs) == 0 { + return + } + + addr := node.Address() + if len(msgs) == 1 { + // Send single message as is + if err := m.rawSendMsgPacket(addr, &node.Node, msgs[0]); err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) + } + } else { + // Otherwise create and send a compound message + compound := makeCompoundMessage(msgs) + if err := m.rawSendMsgPacket(addr, &node.Node, compound.Bytes()); 
err != nil { + m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err) + } + } + } +} + +// pushPull is invoked periodically to randomly perform a complete state +// exchange. Used to ensure a high level of convergence, but is also +// reasonably expensive as the entire state of this node is exchanged +// with the other node. +func (m *Memberlist) pushPull() { + // Get a random live node + m.nodeLock.RLock() + nodes := kRandomNodes(1, m.nodes, func(n *nodeState) bool { + return n.Name == m.config.Name || + n.State != stateAlive + }) + m.nodeLock.RUnlock() + + // If no nodes, bail + if len(nodes) == 0 { + return + } + node := nodes[0] + + // Attempt a push pull + if err := m.pushPullNode(node.Address(), false); err != nil { + m.logger.Printf("[ERR] memberlist: Push/Pull with %s failed: %s", node.Name, err) + } +} + +// pushPullNode does a complete state exchange with a specific node. +func (m *Memberlist) pushPullNode(addr string, join bool) error { + defer metrics.MeasureSince([]string{"memberlist", "pushPullNode"}, time.Now()) + + // Attempt to send and receive with the node + remote, userState, err := m.sendAndReceiveState(addr, join) + if err != nil { + return err + } + + if err := m.mergeRemoteState(join, remote, userState); err != nil { + return err + } + return nil +} + +// verifyProtocol verifies that all the remote nodes can speak with our +// nodes and vice versa on both the core protocol as well as the +// delegate protocol level. +// +// The verification works by finding the maximum minimum and +// minimum maximum understood protocol and delegate versions. In other words, +// it finds the common denominator of protocol and delegate version ranges +// for the entire cluster. +// +// After this, it goes through the entire cluster (local and remote) and +// verifies that everyone's speaking protocol versions satisfy this range. +// If this passes, it means that every node can understand each other. 
+func (m *Memberlist) verifyProtocol(remote []pushNodeState) error { + m.nodeLock.RLock() + defer m.nodeLock.RUnlock() + + // Maximum minimum understood and minimum maximum understood for both + // the protocol and delegate versions. We use this to verify everyone + // can be understood. + var maxpmin, minpmax uint8 + var maxdmin, mindmax uint8 + minpmax = math.MaxUint8 + mindmax = math.MaxUint8 + + for _, rn := range remote { + // If the node isn't alive, then skip it + if rn.State != stateAlive { + continue + } + + // Skip nodes that don't have versions set, it just means + // their version is zero. + if len(rn.Vsn) == 0 { + continue + } + + if rn.Vsn[0] > maxpmin { + maxpmin = rn.Vsn[0] + } + + if rn.Vsn[1] < minpmax { + minpmax = rn.Vsn[1] + } + + if rn.Vsn[3] > maxdmin { + maxdmin = rn.Vsn[3] + } + + if rn.Vsn[4] < mindmax { + mindmax = rn.Vsn[4] + } + } + + for _, n := range m.nodes { + // Ignore non-alive nodes + if n.State != stateAlive { + continue + } + + if n.PMin > maxpmin { + maxpmin = n.PMin + } + + if n.PMax < minpmax { + minpmax = n.PMax + } + + if n.DMin > maxdmin { + maxdmin = n.DMin + } + + if n.DMax < mindmax { + mindmax = n.DMax + } + } + + // Now that we definitively know the minimum and maximum understood + // version that satisfies the whole cluster, we verify that every + // node in the cluster satisfies this. 
+ for _, n := range remote { + var nPCur, nDCur uint8 + if len(n.Vsn) > 0 { + nPCur = n.Vsn[2] + nDCur = n.Vsn[5] + } + + if nPCur < maxpmin || nPCur > minpmax { + return fmt.Errorf( + "Node '%s' protocol version (%d) is incompatible: [%d, %d]", + n.Name, nPCur, maxpmin, minpmax) + } + + if nDCur < maxdmin || nDCur > mindmax { + return fmt.Errorf( + "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", + n.Name, nDCur, maxdmin, mindmax) + } + } + + for _, n := range m.nodes { + nPCur := n.PCur + nDCur := n.DCur + + if nPCur < maxpmin || nPCur > minpmax { + return fmt.Errorf( + "Node '%s' protocol version (%d) is incompatible: [%d, %d]", + n.Name, nPCur, maxpmin, minpmax) + } + + if nDCur < maxdmin || nDCur > mindmax { + return fmt.Errorf( + "Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]", + n.Name, nDCur, maxdmin, mindmax) + } + } + + return nil +} + +// nextSeqNo returns a usable sequence number in a thread safe way +func (m *Memberlist) nextSeqNo() uint32 { + return atomic.AddUint32(&m.sequenceNum, 1) +} + +// nextIncarnation returns the next incarnation number in a thread safe way +func (m *Memberlist) nextIncarnation() uint32 { + return atomic.AddUint32(&m.incarnation, 1) +} + +// skipIncarnation adds the positive offset to the incarnation number. +func (m *Memberlist) skipIncarnation(offset uint32) uint32 { + return atomic.AddUint32(&m.incarnation, offset) +} + +// estNumNodes is used to get the current estimate of the number of nodes +func (m *Memberlist) estNumNodes() int { + return int(atomic.LoadUint32(&m.numNodes)) +} + +type ackMessage struct { + Complete bool + Payload []byte + Timestamp time.Time +} + +// setProbeChannels is used to attach the ackCh to receive a message when an ack +// with a given sequence number is received. The `complete` field of the message +// will be false on timeout. Any nack messages will cause an empty struct to be +// passed to the nackCh, which can be nil if not needed. 
+func (m *Memberlist) setProbeChannels(seqNo uint32, ackCh chan ackMessage, nackCh chan struct{}, timeout time.Duration) { + // Create handler functions for acks and nacks + ackFn := func(payload []byte, timestamp time.Time) { + select { + case ackCh <- ackMessage{true, payload, timestamp}: + default: + } + } + nackFn := func() { + select { + case nackCh <- struct{}{}: + default: + } + } + + // Add the handlers + ah := &ackHandler{ackFn, nackFn, nil} + m.ackLock.Lock() + m.ackHandlers[seqNo] = ah + m.ackLock.Unlock() + + // Setup a reaping routine + ah.timer = time.AfterFunc(timeout, func() { + m.ackLock.Lock() + delete(m.ackHandlers, seqNo) + m.ackLock.Unlock() + select { + case ackCh <- ackMessage{false, nil, time.Now()}: + default: + } + }) +} + +// setAckHandler is used to attach a handler to be invoked when an ack with a +// given sequence number is received. If a timeout is reached, the handler is +// deleted. This is used for indirect pings so does not configure a function +// for nacks. +func (m *Memberlist) setAckHandler(seqNo uint32, ackFn func([]byte, time.Time), timeout time.Duration) { + // Add the handler + ah := &ackHandler{ackFn, nil, nil} + m.ackLock.Lock() + m.ackHandlers[seqNo] = ah + m.ackLock.Unlock() + + // Setup a reaping routine + ah.timer = time.AfterFunc(timeout, func() { + m.ackLock.Lock() + delete(m.ackHandlers, seqNo) + m.ackLock.Unlock() + }) +} + +// Invokes an ack handler if any is associated, and reaps the handler immediately +func (m *Memberlist) invokeAckHandler(ack ackResp, timestamp time.Time) { + m.ackLock.Lock() + ah, ok := m.ackHandlers[ack.SeqNo] + delete(m.ackHandlers, ack.SeqNo) + m.ackLock.Unlock() + if !ok { + return + } + ah.timer.Stop() + ah.ackFn(ack.Payload, timestamp) +} + +// Invokes nack handler if any is associated. 
+func (m *Memberlist) invokeNackHandler(nack nackResp) { + m.ackLock.Lock() + ah, ok := m.ackHandlers[nack.SeqNo] + m.ackLock.Unlock() + if !ok || ah.nackFn == nil { + return + } + ah.nackFn() +} + +// refute gossips an alive message in response to incoming information that we +// are suspect or dead. It will make sure the incarnation number beats the given +// accusedInc value, or you can supply 0 to just get the next incarnation number. +// This alters the node state that's passed in so this MUST be called while the +// nodeLock is held. +func (m *Memberlist) refute(me *nodeState, accusedInc uint32) { + // Make sure the incarnation number beats the accusation. + inc := m.nextIncarnation() + if accusedInc >= inc { + inc = m.skipIncarnation(accusedInc - inc + 1) + } + me.Incarnation = inc + + // Decrease our health because we are being asked to refute a problem. + m.awareness.ApplyDelta(1) + + // Format and broadcast an alive message. + a := alive{ + Incarnation: inc, + Node: me.Name, + Addr: me.Addr, + Port: me.Port, + Meta: me.Meta, + Vsn: []uint8{ + me.PMin, me.PMax, me.PCur, + me.DMin, me.DMax, me.DCur, + }, + } + m.encodeAndBroadcast(me.Addr.String(), aliveMsg, a) +} + +// aliveNode is invoked by the network layer when we get a message about a +// live node. +func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[a.Node] + + // It is possible that during a Leave(), there is already an aliveMsg + // in-queue to be processed but blocked by the locks above. If we let + // that aliveMsg process, it'll cause us to re-join the cluster. This + // ensures that we don't. + if m.leave && a.Node == m.config.Name { + return + } + + // Invoke the Alive delegate if any. This can be used to filter out + // alive messages based on custom logic. For example, using a cluster name. 
+ // Using a merge delegate is not enough, as it is possible for passive + // cluster merging to still occur. + if m.config.Alive != nil { + node := &Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + PMin: a.Vsn[0], + PMax: a.Vsn[1], + PCur: a.Vsn[2], + DMin: a.Vsn[3], + DMax: a.Vsn[4], + DCur: a.Vsn[5], + } + if err := m.config.Alive.NotifyAlive(node); err != nil { + m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s': %s", + a.Node, err) + return + } + } + + // Check if we've never seen this node before, and if not, then + // store this node in our node map. + if !ok { + state = &nodeState{ + Node: Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + }, + State: stateDead, + } + + // Add to map + m.nodeMap[a.Node] = state + + // Get a random offset. This is important to ensure + // the failure detection bound is low on average. If all + // nodes did an append, failure detection bound would be + // very high. + n := len(m.nodes) + offset := randomOffset(n) + + // Add at the end and swap with the node at the offset + m.nodes = append(m.nodes, state) + m.nodes[offset], m.nodes[n] = m.nodes[n], m.nodes[offset] + + // Update numNodes after we've added a new node + atomic.AddUint32(&m.numNodes, 1) + } + + // Check if this address is different than the existing node + if !bytes.Equal([]byte(state.Addr), a.Addr) || state.Port != a.Port { + m.logger.Printf("[ERR] memberlist: Conflicting address for %s. 
Mine: %v:%d Theirs: %v:%d", + state.Name, state.Addr, state.Port, net.IP(a.Addr), a.Port) + + // Inform the conflict delegate if provided + if m.config.Conflict != nil { + other := Node{ + Name: a.Node, + Addr: a.Addr, + Port: a.Port, + Meta: a.Meta, + } + m.config.Conflict.NotifyConflict(&state.Node, &other) + } + return + } + + // Bail if the incarnation number is older, and this is not about us + isLocalNode := state.Name == m.config.Name + if a.Incarnation <= state.Incarnation && !isLocalNode { + return + } + + // Bail if strictly less and this is about us + if a.Incarnation < state.Incarnation && isLocalNode { + return + } + + // Clear out any suspicion timer that may be in effect. + delete(m.nodeTimers, a.Node) + + // Store the old state and meta data + oldState := state.State + oldMeta := state.Meta + + // If this is us we need to refute, otherwise re-broadcast + if !bootstrap && isLocalNode { + // Compute the version vector + versions := []uint8{ + state.PMin, state.PMax, state.PCur, + state.DMin, state.DMax, state.DCur, + } + + // If the Incarnation is the same, we need special handling, since it + // possible for the following situation to happen: + // 1) Start with configuration C, join cluster + // 2) Hard fail / Kill / Shutdown + // 3) Restart with configuration C', join cluster + // + // In this case, other nodes and the local node see the same incarnation, + // but the values may not be the same. For this reason, we always + // need to do an equality check for this Incarnation. In most cases, + // we just ignore, but we may need to refute. 
+ // + if a.Incarnation == state.Incarnation && + bytes.Equal(a.Meta, state.Meta) && + bytes.Equal(a.Vsn, versions) { + return + } + + m.refute(state, a.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting an alive message") + } else { + m.encodeBroadcastNotify(a.Node, aliveMsg, a, notify) + + // Update protocol versions if it arrived + if len(a.Vsn) > 0 { + state.PMin = a.Vsn[0] + state.PMax = a.Vsn[1] + state.PCur = a.Vsn[2] + state.DMin = a.Vsn[3] + state.DMax = a.Vsn[4] + state.DCur = a.Vsn[5] + } + + // Update the state and incarnation number + state.Incarnation = a.Incarnation + state.Meta = a.Meta + if state.State != stateAlive { + state.State = stateAlive + state.StateChange = time.Now() + } + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1) + + // Notify the delegate of any relevant updates + if m.config.Events != nil { + if oldState == stateDead { + // if Dead -> Alive, notify of join + m.config.Events.NotifyJoin(&state.Node) + + } else if !bytes.Equal(oldMeta, state.Meta) { + // if Meta changed, trigger an update notification + m.config.Events.NotifyUpdate(&state.Node) + } + } +} + +// suspectNode is invoked by the network layer when we get a message +// about a suspect node +func (m *Memberlist) suspectNode(s *suspect) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[s.Node] + + // If we've never heard about this node before, ignore it + if !ok { + return + } + + // Ignore old incarnation numbers + if s.Incarnation < state.Incarnation { + return + } + + // See if there's a suspicion timer we can confirm. If the info is new + // to us we will go ahead and re-gossip it. This allows for multiple + // independent confirmations to flow even when a node probes a node + // that's already suspect. 
+ if timer, ok := m.nodeTimers[s.Node]; ok { + if timer.Confirm(s.From) { + m.encodeAndBroadcast(s.Node, suspectMsg, s) + } + return + } + + // Ignore non-alive nodes + if state.State != stateAlive { + return + } + + // If this is us we need to refute, otherwise re-broadcast + if state.Name == m.config.Name { + m.refute(state, s.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting a suspect message (from: %s)", s.From) + return // Do not mark ourself suspect + } else { + m.encodeAndBroadcast(s.Node, suspectMsg, s) + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "suspect"}, 1) + + // Update the state + state.Incarnation = s.Incarnation + state.State = stateSuspect + changeTime := time.Now() + state.StateChange = changeTime + + // Setup a suspicion timer. Given that we don't have any known phase + // relationship with our peers, we set up k such that we hit the nominal + // timeout two probe intervals short of what we expect given the suspicion + // multiplier. + k := m.config.SuspicionMult - 2 + + // If there aren't enough nodes to give the expected confirmations, just + // set k to 0 to say that we don't expect any. Note we subtract 2 from n + // here to take out ourselves and the node being probed. + n := m.estNumNodes() + if n-2 < k { + k = 0 + } + + // Compute the timeouts based on the size of the cluster. 
+ min := suspicionTimeout(m.config.SuspicionMult, n, m.config.ProbeInterval) + max := time.Duration(m.config.SuspicionMaxTimeoutMult) * min + fn := func(numConfirmations int) { + m.nodeLock.Lock() + state, ok := m.nodeMap[s.Node] + timeout := ok && state.State == stateSuspect && state.StateChange == changeTime + m.nodeLock.Unlock() + + if timeout { + if k > 0 && numConfirmations < k { + metrics.IncrCounter([]string{"memberlist", "degraded", "timeout"}, 1) + } + + m.logger.Printf("[INFO] memberlist: Marking %s as failed, suspect timeout reached (%d peer confirmations)", + state.Name, numConfirmations) + d := dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name} + m.deadNode(&d) + } + } + m.nodeTimers[s.Node] = newSuspicion(s.From, k, min, max, fn) +} + +// deadNode is invoked by the network layer when we get a message +// about a dead node +func (m *Memberlist) deadNode(d *dead) { + m.nodeLock.Lock() + defer m.nodeLock.Unlock() + state, ok := m.nodeMap[d.Node] + + // If we've never heard about this node before, ignore it + if !ok { + return + } + + // Ignore old incarnation numbers + if d.Incarnation < state.Incarnation { + return + } + + // Clear out any suspicion timer that may be in effect. 
+ delete(m.nodeTimers, d.Node) + + // Ignore if node is already dead + if state.State == stateDead { + return + } + + // Check if this is us + if state.Name == m.config.Name { + // If we are not leaving we need to refute + if !m.leave { + m.refute(state, d.Incarnation) + m.logger.Printf("[WARN] memberlist: Refuting a dead message (from: %s)", d.From) + return // Do not mark ourself dead + } + + // If we are leaving, we broadcast and wait + m.encodeBroadcastNotify(d.Node, deadMsg, d, m.leaveBroadcast) + } else { + m.encodeAndBroadcast(d.Node, deadMsg, d) + } + + // Update metrics + metrics.IncrCounter([]string{"memberlist", "msg", "dead"}, 1) + + // Update the state + state.Incarnation = d.Incarnation + state.State = stateDead + state.StateChange = time.Now() + + // Notify of death + if m.config.Events != nil { + m.config.Events.NotifyLeave(&state.Node) + } +} + +// mergeState is invoked by the network layer when we get a Push/Pull +// state transfer +func (m *Memberlist) mergeState(remote []pushNodeState) { + for _, r := range remote { + switch r.State { + case stateAlive: + a := alive{ + Incarnation: r.Incarnation, + Node: r.Name, + Addr: r.Addr, + Port: r.Port, + Meta: r.Meta, + Vsn: r.Vsn, + } + m.aliveNode(&a, nil, false) + + case stateDead: + // If the remote node believes a node is dead, we prefer to + // suspect that node instead of declaring it dead instantly + fallthrough + case stateSuspect: + s := suspect{Incarnation: r.Incarnation, Node: r.Name, From: m.config.Name} + m.suspectNode(&s) + } + } +} diff --git a/vendor/github.com/hashicorp/memberlist/state_test.go b/vendor/github.com/hashicorp/memberlist/state_test.go new file mode 100644 index 000000000..8b9c8aaf7 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/state_test.go @@ -0,0 +1,1900 @@ +package memberlist + +import ( + "bytes" + "fmt" + "net" + "testing" + "time" +) + +func HostMemberlist(host string, t *testing.T, f func(*Config)) *Memberlist { + c := DefaultLANConfig() + c.Name = host 
+ c.BindAddr = host + if f != nil { + f(c) + } + + m, err := newMemberlist(c) + if err != nil { + t.Fatalf("failed to get memberlist: %s", err) + } + return m +} + +func TestMemberList_Probe(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 10 * time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, nil) + + a1 := alive{ + Node: addr1.String(), + Addr: []byte(addr1), + Port: uint16(m1.config.BindPort), + Incarnation: 1, + } + m1.aliveNode(&a1, nil, true) + a2 := alive{ + Node: addr2.String(), + Addr: []byte(addr2), + Port: uint16(m2.config.BindPort), + Incarnation: 1, + } + m1.aliveNode(&a2, nil, false) + + // should ping addr2 + m1.probe() + + // Should not be marked suspect + n := m1.nodeMap[addr2.String()] + if n.State != stateAlive { + t.Fatalf("Expect node to be alive") + } + + // Should increment seqno + if m1.sequenceNum != 1 { + t.Fatalf("bad seqno %v", m2.sequenceNum) + } +} + +func TestMemberList_ProbeNode_Suspect(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 10 * time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, nil) + m3 := HostMemberlist(addr3.String(), t, nil) + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1} + m1.aliveNode(&a4, nil, false) + + n := m1.nodeMap[addr4.String()] + 
m1.probeNode(n) + + // Should be marked suspect. + if n.State != stateSuspect { + t.Fatalf("Expect node to be suspect") + } + time.Sleep(10 * time.Millisecond) + + // One of the peers should have attempted an indirect probe. + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } +} + +func TestMemberList_ProbeNode_Suspect_Dogpile(t *testing.T) { + cases := []struct { + numPeers int + confirmations int + expected time.Duration + }{ + {1, 0, 500 * time.Millisecond}, // n=2, k=3 (max timeout disabled) + {2, 0, 500 * time.Millisecond}, // n=3, k=3 + {3, 0, 500 * time.Millisecond}, // n=4, k=3 + {4, 0, 1000 * time.Millisecond}, // n=5, k=3 (max timeout starts to take effect) + {5, 0, 1000 * time.Millisecond}, // n=6, k=3 + {5, 1, 750 * time.Millisecond}, // n=6, k=3 (confirmations start to lower timeout) + {5, 2, 604 * time.Millisecond}, // n=6, k=3 + {5, 3, 500 * time.Millisecond}, // n=6, k=3 (timeout driven to nominal value) + {5, 4, 500 * time.Millisecond}, // n=6, k=3 + } + for i, c := range cases { + // Create the main memberlist under test. + addr := getBindAddr() + m := HostMemberlist(addr.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 100 * time.Millisecond + c.SuspicionMult = 5 + c.SuspicionMaxTimeoutMult = 2 + }) + a := alive{Node: addr.String(), Addr: []byte(addr), Port: 7946, Incarnation: 1} + m.aliveNode(&a, nil, true) + + // Make all but one peer be an real, alive instance. + var peers []*Memberlist + for j := 0; j < c.numPeers-1; j++ { + peerAddr := getBindAddr() + peers = append(peers, HostMemberlist(peerAddr.String(), t, nil)) + a = alive{Node: peerAddr.String(), Addr: []byte(peerAddr), Port: 7946, Incarnation: 1} + m.aliveNode(&a, nil, false) + } + + // Just use a bogus address for the last peer so it doesn't respond + // to pings, but tell the memberlist it's alive. 
+ badPeerAddr := getBindAddr() + a = alive{Node: badPeerAddr.String(), Addr: []byte(badPeerAddr), Port: 7946, Incarnation: 1} + m.aliveNode(&a, nil, false) + + // Force a probe, which should start us into the suspect state. + n := m.nodeMap[badPeerAddr.String()] + m.probeNode(n) + if n.State != stateSuspect { + t.Fatalf("case %d: expected node to be suspect", i) + } + + // Add the requested number of confirmations. + for j := 0; j < c.confirmations; j++ { + from := fmt.Sprintf("peer%d", j) + s := suspect{Node: badPeerAddr.String(), Incarnation: 1, From: from} + m.suspectNode(&s) + } + + // Wait until right before the timeout and make sure the timer + // hasn't fired. + fudge := 25 * time.Millisecond + time.Sleep(c.expected - fudge) + if n.State != stateSuspect { + t.Fatalf("case %d: expected node to still be suspect", i) + } + + // Wait through the timeout and a little after to make sure the + // timer fires. + time.Sleep(2 * fudge) + if n.State != stateDead { + t.Fatalf("case %d: expected node to be dead", i) + } + } +} + +/* +func TestMemberList_ProbeNode_FallbackTCP(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMax time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMax = c.ProbeInterval + 20*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, nil) + defer m2.Shutdown() + + m3 := HostMemberlist(addr3.String(), t, nil) + defer m3.Shutdown() + + m4 := HostMemberlist(addr4.String(), t, nil) + defer m4.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := 
alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + + // Make sure m4 is configured with the same protocol version as m1 so + // the TCP fallback behavior is enabled. + a4 := alive{ + Node: addr4.String(), + Addr: ip4, + Port: 7946, + Incarnation: 1, + Vsn: []uint8{ + ProtocolVersionMin, + ProtocolVersionMax, + m1.config.ProtocolVersion, + m1.config.DelegateProtocolMin, + m1.config.DelegateProtocolMax, + m1.config.DelegateProtocolVersion, + }, + } + m1.aliveNode(&a4, nil, false) + + // Isolate m4 from UDP traffic by re-opening its listener on the wrong + // port. This should force the TCP fallback path to be used. + var err error + if err = m4.udpListener.Close(); err != nil { + t.Fatalf("err: %v", err) + } + udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} + if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { + t.Fatalf("err: %v", err) + } + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Should be marked alive because of the TCP fallback ping. + if n.State != stateAlive { + t.Fatalf("expect node to be alive") + } + + // Make sure TCP activity completed in a timely manner. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. + time.Sleep(probeTimeMax) + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } + + // Now shutdown all inbound TCP traffic to make sure the TCP fallback + // path properly fails when the node is really unreachable. + if err = m4.tcpListener.Close(); err != nil { + t.Fatalf("err: %v", err) + } + tcpAddr := &net.TCPAddr{IP: ip4, Port: 9999} + if m4.tcpListener, err = net.ListenTCP("tcp", tcpAddr); err != nil { + t.Fatalf("err: %v", err) + } + + // Probe again, this time there should be no contact. 
+ startProbe = time.Now() + m1.probeNode(n) + probeTime = time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure TCP activity didn't cause us to wait too long before + // timing out. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. + time.Sleep(probeTimeMax) + if m2.sequenceNum != 2 && m3.sequenceNum != 2 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } +} + +func TestMemberList_ProbeNode_FallbackTCP_Disabled(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMax time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMax = c.ProbeInterval + 20*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, nil) + defer m2.Shutdown() + + m3 := HostMemberlist(addr3.String(), t, nil) + defer m3.Shutdown() + + m4 := HostMemberlist(addr4.String(), t, nil) + defer m4.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + + // Make sure m4 is configured with the same protocol version as m1 so + // the TCP fallback behavior is enabled. 
+ a4 := alive{ + Node: addr4.String(), + Addr: ip4, + Port: 7946, + Incarnation: 1, + Vsn: []uint8{ + ProtocolVersionMin, + ProtocolVersionMax, + m1.config.ProtocolVersion, + m1.config.DelegateProtocolMin, + m1.config.DelegateProtocolMax, + m1.config.DelegateProtocolVersion, + }, + } + m1.aliveNode(&a4, nil, false) + + // Isolate m4 from UDP traffic by re-opening its listener on the wrong + // port. This should force the TCP fallback path to be used. + var err error + if err = m4.udpListener.Close(); err != nil { + t.Fatalf("err: %v", err) + } + udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} + if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { + t.Fatalf("err: %v", err) + } + + // Disable the TCP pings using the config mechanism. + m1.config.DisableTcpPings = true + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure TCP activity didn't cause us to wait too long before + // timing out. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. 
+ time.Sleep(probeTimeMax) + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } +} + +func TestMemberList_ProbeNode_FallbackTCP_OldProtocol(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMax time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMax = c.ProbeInterval + 20*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, nil) + defer m2.Shutdown() + + m3 := HostMemberlist(addr3.String(), t, nil) + defer m3.Shutdown() + + m4 := HostMemberlist(addr4.String(), t, nil) + defer m4.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + + // Set up m4 so that it doesn't understand a version of the protocol + // that supports TCP pings. + a4 := alive{ + Node: addr4.String(), + Addr: ip4, + Port: 7946, + Incarnation: 1, + Vsn: []uint8{ + ProtocolVersionMin, + ProtocolVersion2Compatible, + ProtocolVersion2Compatible, + m1.config.DelegateProtocolMin, + m1.config.DelegateProtocolMax, + m1.config.DelegateProtocolVersion, + }, + } + m1.aliveNode(&a4, nil, false) + + // Isolate m4 from UDP traffic by re-opening its listener on the wrong + // port. This should force the TCP fallback path to be used. 
+ var err error + if err = m4.udpListener.Close(); err != nil { + t.Fatalf("err: %v", err) + } + udpAddr := &net.UDPAddr{IP: ip4, Port: 9999} + if m4.udpListener, err = net.ListenUDP("udp", udpAddr); err != nil { + t.Fatalf("err: %v", err) + } + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure TCP activity didn't cause us to wait too long before + // timing out. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. + time.Sleep(probeTimeMax) + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } +} +*/ + +func TestMemberList_ProbeNode_Awareness_Degraded(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMin time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMin = 2*c.ProbeInterval - 50*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + }) + defer m2.Shutdown() + + m3 := HostMemberlist(addr3.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + }) + defer m3.Shutdown() + + // This will enable nacks by invoking the latest protocol version. 
+ vsn := []uint8{ + ProtocolVersionMin, + ProtocolVersionMax, + m1.config.ProtocolVersion, + m1.config.DelegateProtocolMin, + m1.config.DelegateProtocolMax, + m1.config.DelegateProtocolVersion, + } + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a3, nil, false) + + // Node 4 never gets started. + a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a4, nil, false) + + // Start the health in a degraded state. + m1.awareness.ApplyDelta(1) + if score := m1.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure we timed out approximately on time (note that we accounted + // for the slowed-down failure detector in the probeTimeMin calculation. + if probeTime < probeTimeMin { + t.Fatalf("probed too quickly, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } + + // We should have gotten all the nacks, so our score should remain the + // same, since we didn't get a successful probe. 
+ if score := m1.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_ProbeNode_Awareness_Improved(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, nil) + defer m2.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + + // Start the health in a degraded state. + m1.awareness.ApplyDelta(1) + if score := m1.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } + + // Have node m1 probe m2. + n := m1.nodeMap[addr2.String()] + m1.probeNode(n) + + // Node should be reported alive. + if n.State != stateAlive { + t.Fatalf("expect node to be suspect") + } + + // Our score should have improved since we did a good probe. + if score := m1.GetHealthScore(); score != 0 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_ProbeNode_Awareness_MissedNack(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMax time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMax = c.ProbeInterval + 50*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + }) + defer m2.Shutdown() + + // This will enable nacks by invoking the latest protocol version. 
+ vsn := []uint8{ + ProtocolVersionMin, + ProtocolVersionMax, + m1.config.ProtocolVersion, + m1.config.DelegateProtocolMin, + m1.config.DelegateProtocolMax, + m1.config.DelegateProtocolVersion, + } + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a2, nil, false) + + // Node 3 and node 4 never get started. + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a3, nil, false) + a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1, Vsn: vsn} + m1.aliveNode(&a4, nil, false) + + // Make sure health looks good. + if score := m1.GetHealthScore(); score != 0 { + t.Fatalf("bad: %d", score) + } + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure we timed out approximately on time. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // We should have gotten dinged for the missed nack. 
+ time.Sleep(probeTimeMax) + if score := m1.GetHealthScore(); score != 2 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_ProbeNode_Awareness_OldProtocol(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + addr3 := getBindAddr() + addr4 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + ip3 := []byte(addr3) + ip4 := []byte(addr4) + + var probeTimeMax time.Duration + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = 10 * time.Millisecond + c.ProbeInterval = 200 * time.Millisecond + probeTimeMax = c.ProbeInterval + 20*time.Millisecond + }) + defer m1.Shutdown() + + m2 := HostMemberlist(addr2.String(), t, nil) + defer m2.Shutdown() + + m3 := HostMemberlist(addr3.String(), t, nil) + defer m3.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: addr3.String(), Addr: ip3, Port: 7946, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + + // Node 4 never gets started. + a4 := alive{Node: addr4.String(), Addr: ip4, Port: 7946, Incarnation: 1} + m1.aliveNode(&a4, nil, false) + + // Make sure health looks good. + if score := m1.GetHealthScore(); score != 0 { + t.Fatalf("bad: %d", score) + } + + // Have node m1 probe m4. + n := m1.nodeMap[addr4.String()] + startProbe := time.Now() + m1.probeNode(n) + probeTime := time.Now().Sub(startProbe) + + // Node should be reported suspect. + if n.State != stateSuspect { + t.Fatalf("expect node to be suspect") + } + + // Make sure we timed out approximately on time. + if probeTime > probeTimeMax { + t.Fatalf("took to long to probe, %9.6f", probeTime.Seconds()) + } + + // Confirm at least one of the peers attempted an indirect probe. 
+ time.Sleep(probeTimeMax) + if m2.sequenceNum != 1 && m3.sequenceNum != 1 { + t.Fatalf("bad seqnos %v, %v", m2.sequenceNum, m3.sequenceNum) + } + + // Since we are using the old protocol here, we should have gotten dinged + // for a failed health check. + if score := m1.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_ProbeNode_Buddy(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 10 * time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, nil) + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + + m1.aliveNode(&a1, nil, true) + m1.aliveNode(&a2, nil, false) + m2.aliveNode(&a2, nil, true) + + // Force the state to suspect so we piggyback a suspect message with the ping. + // We should see this get refuted later, and the ping will succeed. + n := m1.nodeMap[addr2.String()] + n.State = stateSuspect + m1.probeNode(n) + + // Make sure a ping was sent. + if m1.sequenceNum != 1 { + t.Fatalf("bad seqno %v", m1.sequenceNum) + } + + // Check a broadcast is queued. + if num := m2.broadcasts.NumQueued(); num != 1 { + t.Fatalf("expected only one queued message: %d", num) + } + + // Should be alive msg. 
+ if messageType(m2.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { + t.Fatalf("expected queued alive msg") + } +} + +func TestMemberList_ProbeNode(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 10 * time.Millisecond + }) + _ = HostMemberlist(addr2.String(), t, nil) + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + + n := m1.nodeMap[addr2.String()] + m1.probeNode(n) + + // Should be marked alive + if n.State != stateAlive { + t.Fatalf("Expect node to be alive") + } + + // Should increment seqno + if m1.sequenceNum != 1 { + t.Fatalf("bad seqno %v", m1.sequenceNum) + } +} + +func TestMemberList_Ping(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.ProbeTimeout = time.Millisecond + c.ProbeInterval = 10 * time.Second + }) + _ = HostMemberlist(addr2.String(), t, nil) + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + + // Do a legit ping. + n := m1.nodeMap[addr2.String()] + addr, err := net.ResolveUDPAddr("udp", net.JoinHostPort(addr2.String(), "7946")) + if err != nil { + t.Fatalf("err: %v", err) + } + rtt, err := m1.Ping(n.Name, addr) + if err != nil { + t.Fatalf("err: %v", err) + } + if !(rtt > 0) { + t.Fatalf("bad: %v", rtt) + } + + // This ping has a bad node name so should timeout. 
+ _, err = m1.Ping("bad", addr) + if _, ok := err.(NoPingResponseError); !ok || err == nil { + t.Fatalf("bad: %v", err) + } +} + +func TestMemberList_ResetNodes(t *testing.T) { + m := GetMemberlist(t) + a1 := alive{Node: "test1", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a1, nil, false) + a2 := alive{Node: "test2", Addr: []byte{127, 0, 0, 2}, Incarnation: 1} + m.aliveNode(&a2, nil, false) + a3 := alive{Node: "test3", Addr: []byte{127, 0, 0, 3}, Incarnation: 1} + m.aliveNode(&a3, nil, false) + d := dead{Node: "test2", Incarnation: 1} + m.deadNode(&d) + + m.config.GossipToTheDeadTime = 100 * time.Millisecond + m.resetNodes() + if len(m.nodes) != 3 { + t.Fatalf("Bad length") + } + if _, ok := m.nodeMap["test2"]; !ok { + t.Fatalf("test2 should not be unmapped") + } + + time.Sleep(200 * time.Millisecond) + m.resetNodes() + if len(m.nodes) != 2 { + t.Fatalf("Bad length") + } + if _, ok := m.nodeMap["test2"]; ok { + t.Fatalf("test2 should be unmapped") + } +} + +func TestMemberList_NextSeq(t *testing.T) { + m := &Memberlist{} + if m.nextSeqNo() != 1 { + t.Fatalf("bad sequence no") + } + if m.nextSeqNo() != 2 { + t.Fatalf("bad sequence no") + } +} + +func TestMemberList_setProbeChannels(t *testing.T) { + m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} + + ch := make(chan ackMessage, 1) + m.setProbeChannels(0, ch, nil, 10*time.Millisecond) + + if _, ok := m.ackHandlers[0]; !ok { + t.Fatalf("missing handler") + } + time.Sleep(20 * time.Millisecond) + + if _, ok := m.ackHandlers[0]; ok { + t.Fatalf("non-reaped handler") + } +} + +func TestMemberList_setAckHandler(t *testing.T) { + m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} + + f := func([]byte, time.Time) {} + m.setAckHandler(0, f, 10*time.Millisecond) + + if _, ok := m.ackHandlers[0]; !ok { + t.Fatalf("missing handler") + } + time.Sleep(20 * time.Millisecond) + + if _, ok := m.ackHandlers[0]; ok { + t.Fatalf("non-reaped handler") + } +} + +func TestMemberList_invokeAckHandler(t 
*testing.T) { + m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} + + // Does nothing + m.invokeAckHandler(ackResp{}, time.Now()) + + var b bool + f := func(payload []byte, timestamp time.Time) { b = true } + m.setAckHandler(0, f, 10*time.Millisecond) + + // Should set b + m.invokeAckHandler(ackResp{0, nil}, time.Now()) + if !b { + t.Fatalf("b not set") + } + + if _, ok := m.ackHandlers[0]; ok { + t.Fatalf("non-reaped handler") + } +} + +func TestMemberList_invokeAckHandler_Channel_Ack(t *testing.T) { + m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} + + ack := ackResp{0, []byte{0, 0, 0}} + + // Does nothing + m.invokeAckHandler(ack, time.Now()) + + ackCh := make(chan ackMessage, 1) + nackCh := make(chan struct{}, 1) + m.setProbeChannels(0, ackCh, nackCh, 10*time.Millisecond) + + // Should send message + m.invokeAckHandler(ack, time.Now()) + + select { + case v := <-ackCh: + if v.Complete != true { + t.Fatalf("Bad value") + } + if bytes.Compare(v.Payload, ack.Payload) != 0 { + t.Fatalf("wrong payload. expected: %v; actual: %v", ack.Payload, v.Payload) + } + + case <-nackCh: + t.Fatalf("should not get a nack") + + default: + t.Fatalf("message not sent") + } + + if _, ok := m.ackHandlers[0]; ok { + t.Fatalf("non-reaped handler") + } +} + +func TestMemberList_invokeAckHandler_Channel_Nack(t *testing.T) { + m := &Memberlist{ackHandlers: make(map[uint32]*ackHandler)} + + nack := nackResp{0} + + // Does nothing. + m.invokeNackHandler(nack) + + ackCh := make(chan ackMessage, 1) + nackCh := make(chan struct{}, 1) + m.setProbeChannels(0, ackCh, nackCh, 10*time.Millisecond) + + // Should send message. + m.invokeNackHandler(nack) + + select { + case <-ackCh: + t.Fatalf("should not get an ack") + + case <-nackCh: + // Good. + + default: + t.Fatalf("message not sent") + } + + // Getting a nack doesn't reap the handler so that we can still forward + // an ack up to the reap time, if we get one. 
+ if _, ok := m.ackHandlers[0]; !ok { + t.Fatalf("handler should not be reaped") + } + + ack := ackResp{0, []byte{0, 0, 0}} + m.invokeAckHandler(ack, time.Now()) + + select { + case v := <-ackCh: + if v.Complete != true { + t.Fatalf("Bad value") + } + if bytes.Compare(v.Payload, ack.Payload) != 0 { + t.Fatalf("wrong payload. expected: %v; actual: %v", ack.Payload, v.Payload) + } + + case <-nackCh: + t.Fatalf("should not get a nack") + + default: + t.Fatalf("message not sent") + } + + if _, ok := m.ackHandlers[0]; ok { + t.Fatalf("non-reaped handler") + } +} + +func TestMemberList_AliveNode_NewNode(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + m.config.Events = &ChannelEventDelegate{ch} + + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + if len(m.nodes) != 1 { + t.Fatalf("should add node") + } + + state, ok := m.nodeMap["test"] + if !ok { + t.Fatalf("should map node") + } + + if state.Incarnation != 1 { + t.Fatalf("bad incarnation") + } + if state.State != stateAlive { + t.Fatalf("bad state") + } + if time.Now().Sub(state.StateChange) > time.Second { + t.Fatalf("bad change delta") + } + + // Check for a join message + select { + case e := <-ch: + if e.Node.Name != "test" { + t.Fatalf("bad node name") + } + default: + t.Fatalf("no join message") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected queued message") + } +} + +func TestMemberList_AliveNode_SuspectNode(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + // Listen only after first join + m.config.Events = &ChannelEventDelegate{ch} + + // Make suspect + state := m.nodeMap["test"] + state.State = stateSuspect + state.StateChange = state.StateChange.Add(-time.Hour) + + // Old incarnation number, should not change + m.aliveNode(&a, nil, false) + if 
state.State != stateSuspect { + t.Fatalf("update with old incarnation!") + } + + // Should reset to alive now + a.Incarnation = 2 + m.aliveNode(&a, nil, false) + if state.State != stateAlive { + t.Fatalf("no update with new incarnation!") + } + + if time.Now().Sub(state.StateChange) > time.Second { + t.Fatalf("bad change delta") + } + + // Check for a no join message + select { + case <-ch: + t.Fatalf("got bad join message") + default: + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected queued message") + } +} + +func TestMemberList_AliveNode_Idempotent(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + // Listen only after first join + m.config.Events = &ChannelEventDelegate{ch} + + // Make suspect + state := m.nodeMap["test"] + stateTime := state.StateChange + + // Should reset to alive now + a.Incarnation = 2 + m.aliveNode(&a, nil, false) + if state.State != stateAlive { + t.Fatalf("non idempotent") + } + + if stateTime != state.StateChange { + t.Fatalf("should not change state") + } + + // Check for a no join message + select { + case <-ch: + t.Fatalf("got bad join message") + default: + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } +} + +// Serf Bug: GH-58, Meta data does not update +func TestMemberList_AliveNode_ChangeMeta(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + + a := alive{ + Node: "test", + Addr: []byte{127, 0, 0, 1}, + Meta: []byte("val1"), + Incarnation: 1} + m.aliveNode(&a, nil, false) + + // Listen only after first join + m.config.Events = &ChannelEventDelegate{ch} + + // Make suspect + state := m.nodeMap["test"] + + // Should reset to alive now + a.Incarnation = 2 + a.Meta = []byte("val2") + m.aliveNode(&a, nil, false) + + // Check updates + if 
bytes.Compare(state.Meta, a.Meta) != 0 { + t.Fatalf("meta did not update") + } + + // Check for a NotifyUpdate + select { + case e := <-ch: + if e.Event != NodeUpdate { + t.Fatalf("bad event: %v", e) + } + if e.Node != &state.Node { + t.Fatalf("bad event: %v", e) + } + if bytes.Compare(e.Node.Meta, a.Meta) != 0 { + t.Fatalf("meta did not update") + } + default: + t.Fatalf("missing event!") + } + +} + +func TestMemberList_AliveNode_Refute(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, true) + + // Clear queue + m.broadcasts.Reset() + + // Conflicting alive + s := alive{ + Node: m.config.Name, + Addr: []byte{127, 0, 0, 1}, + Incarnation: 2, + Meta: []byte("foo"), + } + m.aliveNode(&s, nil, false) + + state := m.nodeMap[m.config.Name] + if state.State != stateAlive { + t.Fatalf("should still be alive") + } + if state.Meta != nil { + t.Fatalf("meta should still be nil") + } + + // Check a broad cast is queued + if num := m.broadcasts.NumQueued(); num != 1 { + t.Fatalf("expected only one queued message: %d", + num) + } + + // Should be alive mesg + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { + t.Fatalf("expected queued alive msg") + } +} + +func TestMemberList_SuspectNode_NoNode(t *testing.T) { + m := GetMemberlist(t) + s := suspect{Node: "test", Incarnation: 1} + m.suspectNode(&s) + if len(m.nodes) != 0 { + t.Fatalf("don't expect nodes") + } +} + +func TestMemberList_SuspectNode(t *testing.T) { + m := GetMemberlist(t) + m.config.ProbeInterval = time.Millisecond + m.config.SuspicionMult = 1 + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + state := m.nodeMap["test"] + state.StateChange = state.StateChange.Add(-time.Hour) + + s := suspect{Node: "test", Incarnation: 1} + m.suspectNode(&s) + + if state.State != stateSuspect { + t.Fatalf("Bad state") + } + + change := state.StateChange + if 
time.Now().Sub(change) > time.Second { + t.Fatalf("bad change delta") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } + + // Check its a suspect message + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != suspectMsg { + t.Fatalf("expected queued suspect msg") + } + + // Wait for the timeout + time.Sleep(10 * time.Millisecond) + + if state.State != stateDead { + t.Fatalf("Bad state") + } + + if time.Now().Sub(state.StateChange) > time.Second { + t.Fatalf("bad change delta") + } + if !state.StateChange.After(change) { + t.Fatalf("should increment time") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } + + // Check its a suspect message + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != deadMsg { + t.Fatalf("expected queued dead msg") + } +} + +func TestMemberList_SuspectNode_DoubleSuspect(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + state := m.nodeMap["test"] + state.StateChange = state.StateChange.Add(-time.Hour) + + s := suspect{Node: "test", Incarnation: 1} + m.suspectNode(&s) + + if state.State != stateSuspect { + t.Fatalf("Bad state") + } + + change := state.StateChange + if time.Now().Sub(change) > time.Second { + t.Fatalf("bad change delta") + } + + // clear the broadcast queue + m.broadcasts.Reset() + + // Suspect again + m.suspectNode(&s) + + if state.StateChange != change { + t.Fatalf("unexpected state change") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 0 { + t.Fatalf("expected only one queued message") + } + +} + +func TestMemberList_SuspectNode_OldSuspect(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} + m.aliveNode(&a, nil, false) + + state := m.nodeMap["test"] + state.StateChange = 
state.StateChange.Add(-time.Hour) + + // Clear queue + m.broadcasts.Reset() + + s := suspect{Node: "test", Incarnation: 1} + m.suspectNode(&s) + + if state.State != stateAlive { + t.Fatalf("Bad state") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 0 { + t.Fatalf("expected only one queued message") + } +} + +func TestMemberList_SuspectNode_Refute(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, true) + + // Clear queue + m.broadcasts.Reset() + + // Make sure health is in a good state + if score := m.GetHealthScore(); score != 0 { + t.Fatalf("bad: %d", score) + } + + s := suspect{Node: m.config.Name, Incarnation: 1} + m.suspectNode(&s) + + state := m.nodeMap[m.config.Name] + if state.State != stateAlive { + t.Fatalf("should still be alive") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } + + // Should be alive mesg + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { + t.Fatalf("expected queued alive msg") + } + + // Health should have been dinged + if score := m.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_DeadNode_NoNode(t *testing.T) { + m := GetMemberlist(t) + d := dead{Node: "test", Incarnation: 1} + m.deadNode(&d) + if len(m.nodes) != 0 { + t.Fatalf("don't expect nodes") + } +} + +func TestMemberList_DeadNode(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + m.config.Events = &ChannelEventDelegate{ch} + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + // Read the join event + <-ch + + state := m.nodeMap["test"] + state.StateChange = state.StateChange.Add(-time.Hour) + + d := dead{Node: "test", Incarnation: 1} + m.deadNode(&d) + + if state.State != stateDead { + t.Fatalf("Bad state") + } + + change := state.StateChange + if 
time.Now().Sub(change) > time.Second { + t.Fatalf("bad change delta") + } + + select { + case leave := <-ch: + if leave.Event != NodeLeave || leave.Node.Name != "test" { + t.Fatalf("bad node name") + } + default: + t.Fatalf("no leave message") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } + + // Check its a suspect message + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != deadMsg { + t.Fatalf("expected queued dead msg") + } +} + +func TestMemberList_DeadNode_Double(t *testing.T) { + ch := make(chan NodeEvent, 1) + m := GetMemberlist(t) + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, false) + + state := m.nodeMap["test"] + state.StateChange = state.StateChange.Add(-time.Hour) + + d := dead{Node: "test", Incarnation: 1} + m.deadNode(&d) + + // Clear queue + m.broadcasts.Reset() + + // Notify after the first dead + m.config.Events = &ChannelEventDelegate{ch} + + // Should do nothing + d.Incarnation = 2 + m.deadNode(&d) + + select { + case <-ch: + t.Fatalf("should not get leave") + default: + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 0 { + t.Fatalf("expected only one queued message") + } +} + +func TestMemberList_DeadNode_OldDead(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} + m.aliveNode(&a, nil, false) + + state := m.nodeMap["test"] + state.StateChange = state.StateChange.Add(-time.Hour) + + d := dead{Node: "test", Incarnation: 1} + m.deadNode(&d) + + if state.State != stateAlive { + t.Fatalf("Bad state") + } +} + +func TestMemberList_DeadNode_AliveReplay(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: "test", Addr: []byte{127, 0, 0, 1}, Incarnation: 10} + m.aliveNode(&a, nil, false) + + d := dead{Node: "test", Incarnation: 10} + m.deadNode(&d) + + // Replay alive at same incarnation + m.aliveNode(&a, nil, false) + + // Should 
remain dead + state, ok := m.nodeMap["test"] + if ok && state.State != stateDead { + t.Fatalf("Bad state") + } +} + +func TestMemberList_DeadNode_Refute(t *testing.T) { + m := GetMemberlist(t) + a := alive{Node: m.config.Name, Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a, nil, true) + + // Clear queue + m.broadcasts.Reset() + + // Make sure health is in a good state + if score := m.GetHealthScore(); score != 0 { + t.Fatalf("bad: %d", score) + } + + d := dead{Node: m.config.Name, Incarnation: 1} + m.deadNode(&d) + + state := m.nodeMap[m.config.Name] + if state.State != stateAlive { + t.Fatalf("should still be alive") + } + + // Check a broad cast is queued + if m.broadcasts.NumQueued() != 1 { + t.Fatalf("expected only one queued message") + } + + // Should be alive mesg + if messageType(m.broadcasts.bcQueue[0].b.Message()[0]) != aliveMsg { + t.Fatalf("expected queued alive msg") + } + + // We should have been dinged + if score := m.GetHealthScore(); score != 1 { + t.Fatalf("bad: %d", score) + } +} + +func TestMemberList_MergeState(t *testing.T) { + m := GetMemberlist(t) + a1 := alive{Node: "test1", Addr: []byte{127, 0, 0, 1}, Incarnation: 1} + m.aliveNode(&a1, nil, false) + a2 := alive{Node: "test2", Addr: []byte{127, 0, 0, 2}, Incarnation: 1} + m.aliveNode(&a2, nil, false) + a3 := alive{Node: "test3", Addr: []byte{127, 0, 0, 3}, Incarnation: 1} + m.aliveNode(&a3, nil, false) + + s := suspect{Node: "test1", Incarnation: 1} + m.suspectNode(&s) + + remote := []pushNodeState{ + pushNodeState{ + Name: "test1", + Addr: []byte{127, 0, 0, 1}, + Incarnation: 2, + State: stateAlive, + }, + pushNodeState{ + Name: "test2", + Addr: []byte{127, 0, 0, 2}, + Incarnation: 1, + State: stateSuspect, + }, + pushNodeState{ + Name: "test3", + Addr: []byte{127, 0, 0, 3}, + Incarnation: 1, + State: stateDead, + }, + pushNodeState{ + Name: "test4", + Addr: []byte{127, 0, 0, 4}, + Incarnation: 2, + State: stateAlive, + }, + } + + // Listen for changes + eventCh := make(chan 
NodeEvent, 1) + m.config.Events = &ChannelEventDelegate{eventCh} + + // Merge remote state + m.mergeState(remote) + + // Check the states + state := m.nodeMap["test1"] + if state.State != stateAlive || state.Incarnation != 2 { + t.Fatalf("Bad state %v", state) + } + + state = m.nodeMap["test2"] + if state.State != stateSuspect || state.Incarnation != 1 { + t.Fatalf("Bad state %v", state) + } + + state = m.nodeMap["test3"] + if state.State != stateSuspect { + t.Fatalf("Bad state %v", state) + } + + state = m.nodeMap["test4"] + if state.State != stateAlive || state.Incarnation != 2 { + t.Fatalf("Bad state %v", state) + } + + // Check the channels + select { + case e := <-eventCh: + if e.Event != NodeJoin || e.Node.Name != "test4" { + t.Fatalf("bad node %v", e) + } + default: + t.Fatalf("Expect join") + } + + select { + case e := <-eventCh: + t.Fatalf("Unexpect event: %v", e) + default: + } +} + +func TestMemberlist_Gossip(t *testing.T) { + ch := make(chan NodeEvent, 3) + + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.GossipInterval = time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, func(c *Config) { + c.Events = &ChannelEventDelegate{ch} + c.GossipInterval = time.Millisecond + }) + + defer m1.Shutdown() + defer m2.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + a3 := alive{Node: "172.0.0.1", Addr: []byte{172, 0, 0, 1}, Incarnation: 1} + m1.aliveNode(&a3, nil, false) + + // Gossip should send all this to m2 + m1.gossip() + + for i := 0; i < 3; i++ { + select { + case <-ch: + case <-time.After(50 * time.Millisecond): + t.Fatalf("timeout") + } + } +} + +func TestMemberlist_GossipToDead(t *testing.T) { + ch := make(chan NodeEvent, 2) + + addr1 := getBindAddr() + 
addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.GossipInterval = time.Millisecond + c.GossipToTheDeadTime = 100 * time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, func(c *Config) { + c.Events = &ChannelEventDelegate{ch} + }) + + defer m1.Shutdown() + defer m2.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + + // Shouldn't send anything to m2 here, node has been dead for 2x the GossipToTheDeadTime + m1.nodeMap[addr2.String()].State = stateDead + m1.nodeMap[addr2.String()].StateChange = time.Now().Add(-200 * time.Millisecond) + m1.gossip() + + select { + case <-ch: + t.Fatalf("shouldn't get gossip") + case <-time.After(50 * time.Millisecond): + } + + // Should gossip to m2 because its state has changed within GossipToTheDeadTime + m1.nodeMap[addr2.String()].StateChange = time.Now().Add(-20 * time.Millisecond) + m1.gossip() + + for i := 0; i < 2; i++ { + select { + case <-ch: + case <-time.After(50 * time.Millisecond): + t.Fatalf("timeout") + } + } +} + +func TestMemberlist_PushPull(t *testing.T) { + addr1 := getBindAddr() + addr2 := getBindAddr() + ip1 := []byte(addr1) + ip2 := []byte(addr2) + + ch := make(chan NodeEvent, 3) + + m1 := HostMemberlist(addr1.String(), t, func(c *Config) { + c.GossipInterval = 10 * time.Second + c.PushPullInterval = time.Millisecond + }) + m2 := HostMemberlist(addr2.String(), t, func(c *Config) { + c.GossipInterval = 10 * time.Second + c.Events = &ChannelEventDelegate{ch} + }) + + defer m1.Shutdown() + defer m2.Shutdown() + + a1 := alive{Node: addr1.String(), Addr: ip1, Port: 7946, Incarnation: 1} + m1.aliveNode(&a1, nil, true) + a2 := alive{Node: addr2.String(), Addr: ip2, Port: 7946, Incarnation: 1} + m1.aliveNode(&a2, nil, false) + + // Gossip should send all 
this to m2 + m1.pushPull() + + for i := 0; i < 2; i++ { + select { + case <-ch: + case <-time.After(10 * time.Millisecond): + t.Fatalf("timeout") + } + } +} + +func TestVerifyProtocol(t *testing.T) { + cases := []struct { + Anodes [][3]uint8 + Bnodes [][3]uint8 + expected bool + }{ + // Both running identical everything + { + Anodes: [][3]uint8{ + {0, 0, 0}, + }, + Bnodes: [][3]uint8{ + {0, 0, 0}, + }, + expected: true, + }, + + // One can understand newer, but speaking same protocol + { + Anodes: [][3]uint8{ + {0, 0, 0}, + }, + Bnodes: [][3]uint8{ + {0, 1, 0}, + }, + expected: true, + }, + + // One is speaking outside the range + { + Anodes: [][3]uint8{ + {0, 0, 0}, + }, + Bnodes: [][3]uint8{ + {1, 1, 1}, + }, + expected: false, + }, + + // Transitively outside the range + { + Anodes: [][3]uint8{ + {0, 1, 0}, + {0, 2, 1}, + }, + Bnodes: [][3]uint8{ + {1, 3, 1}, + }, + expected: false, + }, + + // Multi-node + { + Anodes: [][3]uint8{ + {0, 3, 2}, + {0, 2, 0}, + }, + Bnodes: [][3]uint8{ + {0, 2, 1}, + {0, 5, 0}, + }, + expected: true, + }, + } + + for _, tc := range cases { + aCore := make([][6]uint8, len(tc.Anodes)) + aApp := make([][6]uint8, len(tc.Anodes)) + for i, n := range tc.Anodes { + aCore[i] = [6]uint8{n[0], n[1], n[2], 0, 0, 0} + aApp[i] = [6]uint8{0, 0, 0, n[0], n[1], n[2]} + } + + bCore := make([][6]uint8, len(tc.Bnodes)) + bApp := make([][6]uint8, len(tc.Bnodes)) + for i, n := range tc.Bnodes { + bCore[i] = [6]uint8{n[0], n[1], n[2], 0, 0, 0} + bApp[i] = [6]uint8{0, 0, 0, n[0], n[1], n[2]} + } + + // Test core protocol verification + testVerifyProtocolSingle(t, aCore, bCore, tc.expected) + testVerifyProtocolSingle(t, bCore, aCore, tc.expected) + + // Test app protocol verification + testVerifyProtocolSingle(t, aApp, bApp, tc.expected) + testVerifyProtocolSingle(t, bApp, aApp, tc.expected) + } +} + +func testVerifyProtocolSingle(t *testing.T, A [][6]uint8, B [][6]uint8, expect bool) { + m := GetMemberlist(t) + defer m.Shutdown() + + m.nodes = 
make([]*nodeState, len(A)) + for i, n := range A { + m.nodes[i] = &nodeState{ + Node: Node{ + PMin: n[0], + PMax: n[1], + PCur: n[2], + DMin: n[3], + DMax: n[4], + DCur: n[5], + }, + } + } + + remote := make([]pushNodeState, len(B)) + for i, n := range B { + remote[i] = pushNodeState{ + Name: fmt.Sprintf("node %d", i), + Vsn: []uint8{n[0], n[1], n[2], n[3], n[4], n[5]}, + } + } + + err := m.verifyProtocol(remote) + if (err == nil) != expect { + t.Fatalf("bad:\nA: %v\nB: %v\nErr: %s", A, B, err) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/suspicion.go b/vendor/github.com/hashicorp/memberlist/suspicion.go new file mode 100644 index 000000000..5f573e1fc --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/suspicion.go @@ -0,0 +1,130 @@ +package memberlist + +import ( + "math" + "sync/atomic" + "time" +) + +// suspicion manages the suspect timer for a node and provides an interface +// to accelerate the timeout as we get more independent confirmations that +// a node is suspect. +type suspicion struct { + // n is the number of independent confirmations we've seen. This must + // be updated using atomic instructions to prevent contention with the + // timer callback. + n int32 + + // k is the number of independent confirmations we'd like to see in + // order to drive the timer to its minimum value. + k int32 + + // min is the minimum timer value. + min time.Duration + + // max is the maximum timer value. + max time.Duration + + // start captures the timestamp when we began the timer. This is used + // so we can calculate durations to feed the timer during updates in + // a way the achieves the overall time we'd like. + start time.Time + + // timer is the underlying timer that implements the timeout. + timer *time.Timer + + // f is the function to call when the timer expires. We hold on to this + // because there are cases where we call it directly. 
+ timeoutFn func() + + // confirmations is a map of "from" nodes that have confirmed a given + // node is suspect. This prevents double counting. + confirmations map[string]struct{} +} + +// newSuspicion returns a timer started with the max time, and that will drive +// to the min time after seeing k or more confirmations. The from node will be +// excluded from confirmations since we might get our own suspicion message +// gossiped back to us. The minimum time will be used if no confirmations are +// called for (k <= 0). +func newSuspicion(from string, k int, min time.Duration, max time.Duration, fn func(int)) *suspicion { + s := &suspicion{ + k: int32(k), + min: min, + max: max, + confirmations: make(map[string]struct{}), + } + + // Exclude the from node from any confirmations. + s.confirmations[from] = struct{}{} + + // Pass the number of confirmations into the timeout function for + // easy telemetry. + s.timeoutFn = func() { + fn(int(atomic.LoadInt32(&s.n))) + } + + // If there aren't any confirmations to be made then take the min + // time from the start. + timeout := max + if k < 1 { + timeout = min + } + s.timer = time.AfterFunc(timeout, s.timeoutFn) + + // Capture the start time right after starting the timer above so + // we should always err on the side of a little longer timeout if + // there's any preemption that separates this and the step above. + s.start = time.Now() + return s +} + +// remainingSuspicionTime takes the state variables of the suspicion timer and +// calculates the remaining time to wait before considering a node dead. The +// return value can be negative, so be prepared to fire the timer immediately in +// that case. 
+func remainingSuspicionTime(n, k int32, elapsed time.Duration, min, max time.Duration) time.Duration { + frac := math.Log(float64(n)+1.0) / math.Log(float64(k)+1.0) + raw := max.Seconds() - frac*(max.Seconds()-min.Seconds()) + timeout := time.Duration(math.Floor(1000.0*raw)) * time.Millisecond + if timeout < min { + timeout = min + } + + // We have to take into account the amount of time that has passed so + // far, so we get the right overall timeout. + return timeout - elapsed +} + +// Confirm registers that a possibly new peer has also determined the given +// node is suspect. This returns true if this was new information, and false +// if it was a duplicate confirmation, or if we've got enough confirmations to +// hit the minimum. +func (s *suspicion) Confirm(from string) bool { + // If we've got enough confirmations then stop accepting them. + if atomic.LoadInt32(&s.n) >= s.k { + return false + } + + // Only allow one confirmation from each possible peer. + if _, ok := s.confirmations[from]; ok { + return false + } + s.confirmations[from] = struct{}{} + + // Compute the new timeout given the current number of confirmations and + // adjust the timer. If the timeout becomes negative *and* we can cleanly + // stop the timer then we will call the timeout function directly from + // here. 
+ n := atomic.AddInt32(&s.n, 1) + elapsed := time.Now().Sub(s.start) + remaining := remainingSuspicionTime(n, s.k, elapsed, s.min, s.max) + if s.timer.Stop() { + if remaining > 0 { + s.timer.Reset(remaining) + } else { + go s.timeoutFn() + } + } + return true +} diff --git a/vendor/github.com/hashicorp/memberlist/suspicion_test.go b/vendor/github.com/hashicorp/memberlist/suspicion_test.go new file mode 100644 index 000000000..1b5ca8a5a --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/suspicion_test.go @@ -0,0 +1,198 @@ +package memberlist + +import ( + "testing" + "time" +) + +func TestSuspicion_remainingSuspicionTime(t *testing.T) { + cases := []struct { + n int32 + k int32 + elapsed time.Duration + min time.Duration + max time.Duration + expected time.Duration + }{ + {0, 3, 0, 2 * time.Second, 30 * time.Second, 30 * time.Second}, + {1, 3, 2 * time.Second, 2 * time.Second, 30 * time.Second, 14 * time.Second}, + {2, 3, 3 * time.Second, 2 * time.Second, 30 * time.Second, 4810 * time.Millisecond}, + {3, 3, 4 * time.Second, 2 * time.Second, 30 * time.Second, -2 * time.Second}, + {4, 3, 5 * time.Second, 2 * time.Second, 30 * time.Second, -3 * time.Second}, + {5, 3, 10 * time.Second, 2 * time.Second, 30 * time.Second, -8 * time.Second}, + } + for i, c := range cases { + remaining := remainingSuspicionTime(c.n, c.k, c.elapsed, c.min, c.max) + if remaining != c.expected { + t.Errorf("case %d: remaining %9.6f != expected %9.6f", i, remaining.Seconds(), c.expected.Seconds()) + } + } +} + +func TestSuspicion_Timer(t *testing.T) { + const k = 3 + const min = 500 * time.Millisecond + const max = 2 * time.Second + + type pair struct { + from string + newInfo bool + } + cases := []struct { + numConfirmations int + from string + confirmations []pair + expected time.Duration + }{ + { + 0, + "me", + []pair{}, + max, + }, + { + 1, + "me", + []pair{ + pair{"me", false}, + pair{"foo", true}, + }, + 1250 * time.Millisecond, + }, + { + 1, + "me", + []pair{ + pair{"me", false}, 
+ pair{"foo", true}, + pair{"foo", false}, + pair{"foo", false}, + }, + 1250 * time.Millisecond, + }, + { + 2, + "me", + []pair{ + pair{"me", false}, + pair{"foo", true}, + pair{"bar", true}, + }, + 810 * time.Millisecond, + }, + { + 3, + "me", + []pair{ + pair{"me", false}, + pair{"foo", true}, + pair{"bar", true}, + pair{"baz", true}, + }, + min, + }, + { + 3, + "me", + []pair{ + pair{"me", false}, + pair{"foo", true}, + pair{"bar", true}, + pair{"baz", true}, + pair{"zoo", false}, + }, + min, + }, + } + for i, c := range cases { + ch := make(chan time.Duration, 1) + start := time.Now() + f := func(numConfirmations int) { + if numConfirmations != c.numConfirmations { + t.Errorf("case %d: bad %d != %d", i, numConfirmations, c.numConfirmations) + } + + ch <- time.Now().Sub(start) + } + + // Create the timer and add the requested confirmations. Wait + // the fudge amount to help make sure we calculate the timeout + // overall, and don't accumulate extra time. + s := newSuspicion(c.from, k, min, max, f) + fudge := 25 * time.Millisecond + for _, p := range c.confirmations { + time.Sleep(fudge) + if s.Confirm(p.from) != p.newInfo { + t.Fatalf("case %d: newInfo mismatch for %s", i, p.from) + } + } + + // Wait until right before the timeout and make sure the + // timer hasn't fired. + already := time.Duration(len(c.confirmations)) * fudge + time.Sleep(c.expected - already - fudge) + select { + case d := <-ch: + t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) + default: + } + + // Wait through the timeout and a little after and make sure it + // fires. + time.Sleep(2 * fudge) + select { + case <-ch: + default: + t.Fatalf("case %d: should have fired", i) + } + + // Confirm after to make sure it handles a negative remaining + // time correctly and doesn't fire again. 
+ s.Confirm("late") + time.Sleep(c.expected + 2*fudge) + select { + case d := <-ch: + t.Fatalf("case %d: should not have fired (%9.6f)", i, d.Seconds()) + default: + } + } +} + +func TestSuspicion_Timer_ZeroK(t *testing.T) { + ch := make(chan struct{}, 1) + f := func(int) { + ch <- struct{}{} + } + + // This should select the min time since there are no expected + // confirmations to accelerate the timer. + s := newSuspicion("me", 0, 25*time.Millisecond, 30*time.Second, f) + if s.Confirm("foo") { + t.Fatalf("should not provide new information") + } + + select { + case <-ch: + case <-time.After(50 * time.Millisecond): + t.Fatalf("should have fired") + } +} + +func TestSuspicion_Timer_Immediate(t *testing.T) { + ch := make(chan struct{}, 1) + f := func(int) { + ch <- struct{}{} + } + + // This should underflow the timeout and fire immediately. + s := newSuspicion("me", 1, 100*time.Millisecond, 30*time.Second, f) + time.Sleep(200 * time.Millisecond) + s.Confirm("foo") + + // Wait a little while since the function gets called in a goroutine. + select { + case <-ch: + case <-time.After(25 * time.Millisecond): + t.Fatalf("should have fired") + } +} diff --git a/vendor/github.com/hashicorp/memberlist/tag.sh b/vendor/github.com/hashicorp/memberlist/tag.sh new file mode 100755 index 000000000..cd16623a7 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/tag.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +set -e + +# The version must be supplied from the environment. Do not include the +# leading "v". +if [ -z $VERSION ]; then + echo "Please specify a version." + exit 1 +fi + +# Generate the tag. +echo "==> Tagging version $VERSION..." 
+git commit --allow-empty -a --gpg-sign=348FFC4C -m "Release v$VERSION" +git tag -a -m "Version $VERSION" -s -u 348FFC4C "v${VERSION}" master + +exit 0 diff --git a/vendor/github.com/hashicorp/memberlist/test/setup_subnet.sh b/vendor/github.com/hashicorp/memberlist/test/setup_subnet.sh new file mode 100755 index 000000000..6651c8ce4 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/test/setup_subnet.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# This script makes sure that 127.0.0.x is routable. On Darwin, there +# is a bug that it isn't routable and this causes errors. +# + +# Check if loopback is setup +ping -c 1 -W 10 127.0.0.2 > /dev/null 2>&1 +if [ $? -eq 0 ] +then + exit +fi + +# If we're not on OS X, then error +case $OSTYPE in + darwin*) + ;; + *) + echo "Can't setup interfaces on non-Mac. Error!" + exit 1 + ;; +esac + +# Setup loopback +for ((i=2;i<256;i++)) +do + sudo ifconfig lo0 alias 127.0.0.$i up +done diff --git a/vendor/github.com/hashicorp/memberlist/todo.md b/vendor/github.com/hashicorp/memberlist/todo.md new file mode 100644 index 000000000..009c1d647 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/todo.md @@ -0,0 +1,6 @@ +# TODO +* Dynamic RTT discovery + * Compute 99th percentile for ping/ack + * Better lower bound for ping/ack, faster failure detection +* Dynamic MTU discovery + * Prevent lost updates, increases efficiency diff --git a/vendor/github.com/hashicorp/memberlist/transport.go b/vendor/github.com/hashicorp/memberlist/transport.go new file mode 100644 index 000000000..ca0a66083 --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/transport.go @@ -0,0 +1,65 @@ +package memberlist + +import ( + "net" + "time" +) + +// Packet is used to provide some metadata about incoming packets from peers +// over a packet connection, as well as the packet payload. +type Packet struct { + // Buf has the raw contents of the packet. + Buf []byte + + // From has the address of the peer. 
This is an actual net.Addr so we + // can expose some concrete details about incoming packets. + From net.Addr + + // Timestamp is the time when the packet was received. This should be + // taken as close as possible to the actual receipt time to help make an + // accurate RTT measurements during probes. + Timestamp time.Time +} + +// Transport is used to abstract over communicating with other peers. The packet +// interface is assumed to be best-effort and the stream interface is assumed to +// be reliable. +type Transport interface { + // FinalAdvertiseAddr is given the user's configured values (which + // might be empty) and returns the desired IP and port to advertise to + // the rest of the cluster. + FinalAdvertiseAddr(ip string, port int) (net.IP, int, error) + + // WriteTo is a packet-oriented interface that fires off the given + // payload to the given address in a connectionless fashion. This should + // return a time stamp that's as close as possible to when the packet + // was transmitted to help make accurate RTT measurements during probes. + // + // This is similar to net.PacketConn, though we didn't want to expose + // that full set of required methods to keep assumptions about the + // underlying plumbing to a minimum. We also treat the address here as a + // string, similar to Dial, so it's network neutral, so this usually is + // in the form of "host:port". + WriteTo(b []byte, addr string) (time.Time, error) + + // PacketCh returns a channel that can be read to receive incoming + // packets from other peers. How this is set up for listening is left as + // an exercise for the concrete transport implementations. + PacketCh() <-chan *Packet + + // DialTimeout is used to create a connection that allows us to perform + // two-way communication with a peer. This is generally more expensive + // than packet connections so is used for more infrequent operations + // such as anti-entropy or fallback probes if the packet-oriented probe + // failed. 
+ DialTimeout(addr string, timeout time.Duration) (net.Conn, error) + + // StreamCh returns a channel that can be read to handle incoming stream + // connections from other peers. How this is set up for listening is + // left as an exercise for the concrete transport implementations. + StreamCh() <-chan net.Conn + + // Shutdown is called when memberlist is shutting down; this gives the + // transport a chance to clean up any listeners. + Shutdown() error +} diff --git a/vendor/github.com/hashicorp/memberlist/transport_test.go b/vendor/github.com/hashicorp/memberlist/transport_test.go new file mode 100644 index 000000000..b5249eb5f --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/transport_test.go @@ -0,0 +1,124 @@ +package memberlist + +import ( + "bytes" + "testing" + "time" +) + +func TestTransport_Join(t *testing.T) { + net := &MockNetwork{} + + t1 := net.NewTransport() + + c1 := DefaultLANConfig() + c1.Name = "node1" + c1.Transport = t1 + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %v", err) + } + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + c2 := DefaultLANConfig() + c2.Name = "node2" + c2.Transport = net.NewTransport() + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %v", err) + } + m2.setAlive() + m2.schedule() + defer m2.Shutdown() + + num, err := m2.Join([]string{t1.addr.String()}) + if num != 1 { + t.Fatalf("bad: %d", num) + } + if err != nil { + t.Fatalf("err: %v", err) + } + + if len(m2.Members()) != 2 { + t.Fatalf("bad: %v", m2.Members()) + } + if m2.estNumNodes() != 2 { + t.Fatalf("bad: %v", m2.Members()) + } + +} + +func TestTransport_Send(t *testing.T) { + net := &MockNetwork{} + + t1 := net.NewTransport() + d1 := &MockDelegate{} + + c1 := DefaultLANConfig() + c1.Name = "node1" + c1.Transport = t1 + c1.Delegate = d1 + m1, err := Create(c1) + if err != nil { + t.Fatalf("err: %v", err) + } + m1.setAlive() + m1.schedule() + defer m1.Shutdown() + + c2 := DefaultLANConfig() + c2.Name = "node2" + c2.Transport = 
net.NewTransport() + m2, err := Create(c2) + if err != nil { + t.Fatalf("err: %v", err) + } + m2.setAlive() + m2.schedule() + defer m2.Shutdown() + + num, err := m2.Join([]string{t1.addr.String()}) + if num != 1 { + t.Fatalf("bad: %d", num) + } + if err != nil { + t.Fatalf("err: %v", err) + } + + if err := m2.SendTo(t1.addr, []byte("SendTo")); err != nil { + t.Fatalf("err: %v", err) + } + + var n1 *Node + for _, n := range m2.Members() { + if n.Name == c1.Name { + n1 = n + break + } + } + if n1 == nil { + t.Fatalf("bad") + } + + if err := m2.SendToUDP(n1, []byte("SendToUDP")); err != nil { + t.Fatalf("err: %v", err) + } + if err := m2.SendToTCP(n1, []byte("SendToTCP")); err != nil { + t.Fatalf("err: %v", err) + } + if err := m2.SendBestEffort(n1, []byte("SendBestEffort")); err != nil { + t.Fatalf("err: %v", err) + } + if err := m2.SendReliable(n1, []byte("SendReliable")); err != nil { + t.Fatalf("err: %v", err) + } + time.Sleep(100 * time.Millisecond) + + received := bytes.Join(d1.msgs, []byte("|")) + expected := []byte("SendTo|SendToUDP|SendToTCP|SendBestEffort|SendReliable") + if !bytes.Equal(received, expected) { + t.Fatalf("bad: %s", received) + } +} diff --git a/vendor/github.com/hashicorp/memberlist/util.go b/vendor/github.com/hashicorp/memberlist/util.go new file mode 100644 index 000000000..a4f926e3a --- /dev/null +++ b/vendor/github.com/hashicorp/memberlist/util.go @@ -0,0 +1,296 @@ +package memberlist + +import ( + "bytes" + "compress/lzw" + "encoding/binary" + "fmt" + "io" + "math" + "math/rand" + "net" + "strconv" + "strings" + "time" + + "github.com/hashicorp/go-msgpack/codec" + "github.com/sean-/seed" +) + +// pushPullScale is the minimum number of nodes +// before we start scaling the push/pull timing. The scale +// effect is the log2(Nodes) - log2(pushPullScale). This means +// that the 33rd node will cause us to double the interval, +// while the 65th will triple it. 
+const pushPullScaleThreshold = 32 + +const ( + // Constant litWidth 2-8 + lzwLitWidth = 8 +) + +func init() { + seed.Init() +} + +// Decode reverses the encode operation on a byte slice input +func decode(buf []byte, out interface{}) error { + r := bytes.NewReader(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer +func encode(msgType messageType, in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + buf.WriteByte(uint8(msgType)) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// Returns a random offset between 0 and n +func randomOffset(n int) int { + if n == 0 { + return 0 + } + return int(rand.Uint32() % uint32(n)) +} + +// suspicionTimeout computes the timeout that should be used when +// a node is suspected +func suspicionTimeout(suspicionMult, n int, interval time.Duration) time.Duration { + nodeScale := math.Max(1.0, math.Log10(math.Max(1.0, float64(n)))) + // multiply by 1000 to keep some precision because time.Duration is an int64 type + timeout := time.Duration(suspicionMult) * time.Duration(nodeScale*1000) * interval / 1000 + return timeout +} + +// retransmitLimit computes the limit of retransmissions +func retransmitLimit(retransmitMult, n int) int { + nodeScale := math.Ceil(math.Log10(float64(n + 1))) + limit := retransmitMult * int(nodeScale) + return limit +} + +// shuffleNodes randomly shuffles the input nodes using the Fisher-Yates shuffle +func shuffleNodes(nodes []*nodeState) { + n := len(nodes) + for i := n - 1; i > 0; i-- { + j := rand.Intn(i + 1) + nodes[i], nodes[j] = nodes[j], nodes[i] + } +} + +// pushPushScale is used to scale the time interval at which push/pull +// syncs take place. 
It is used to prevent network saturation as the +// cluster size grows +func pushPullScale(interval time.Duration, n int) time.Duration { + // Don't scale until we cross the threshold + if n <= pushPullScaleThreshold { + return interval + } + + multiplier := math.Ceil(math.Log2(float64(n))-math.Log2(pushPullScaleThreshold)) + 1.0 + return time.Duration(multiplier) * interval +} + +// moveDeadNodes moves nodes that are dead and beyond the gossip to the dead interval +// to the end of the slice and returns the index of the first moved node. +func moveDeadNodes(nodes []*nodeState, gossipToTheDeadTime time.Duration) int { + numDead := 0 + n := len(nodes) + for i := 0; i < n-numDead; i++ { + if nodes[i].State != stateDead { + continue + } + + // Respect the gossip to the dead interval + if time.Since(nodes[i].StateChange) <= gossipToTheDeadTime { + continue + } + + // Move this node to the end + nodes[i], nodes[n-numDead-1] = nodes[n-numDead-1], nodes[i] + numDead++ + i-- + } + return n - numDead +} + +// kRandomNodes is used to select up to k random nodes, excluding any nodes where +// the filter function returns true. It is possible that less than k nodes are +// returned. +func kRandomNodes(k int, nodes []*nodeState, filterFn func(*nodeState) bool) []*nodeState { + n := len(nodes) + kNodes := make([]*nodeState, 0, k) +OUTER: + // Probe up to 3*n times, with large n this is not necessary + // since k << n, but with small n we want search to be + // exhaustive + for i := 0; i < 3*n && len(kNodes) < k; i++ { + // Get random node + idx := randomOffset(n) + node := nodes[idx] + + // Give the filter a shot at it. 
+ if filterFn != nil && filterFn(node) { + continue OUTER + } + + // Check if we have this node already + for j := 0; j < len(kNodes); j++ { + if node == kNodes[j] { + continue OUTER + } + } + + // Append the node + kNodes = append(kNodes, node) + } + return kNodes +} + +// makeCompoundMessage takes a list of messages and generates +// a single compound message containing all of them +func makeCompoundMessage(msgs [][]byte) *bytes.Buffer { + // Create a local buffer + buf := bytes.NewBuffer(nil) + + // Write out the type + buf.WriteByte(uint8(compoundMsg)) + + // Write out the number of message + buf.WriteByte(uint8(len(msgs))) + + // Add the message lengths + for _, m := range msgs { + binary.Write(buf, binary.BigEndian, uint16(len(m))) + } + + // Append the messages + for _, m := range msgs { + buf.Write(m) + } + + return buf +} + +// decodeCompoundMessage splits a compound message and returns +// the slices of individual messages. Also returns the number +// of truncated messages and any potential error +func decodeCompoundMessage(buf []byte) (trunc int, parts [][]byte, err error) { + if len(buf) < 1 { + err = fmt.Errorf("missing compound length byte") + return + } + numParts := uint8(buf[0]) + buf = buf[1:] + + // Check we have enough bytes + if len(buf) < int(numParts*2) { + err = fmt.Errorf("truncated len slice") + return + } + + // Decode the lengths + lengths := make([]uint16, numParts) + for i := 0; i < int(numParts); i++ { + lengths[i] = binary.BigEndian.Uint16(buf[i*2 : i*2+2]) + } + buf = buf[numParts*2:] + + // Split each message + for idx, msgLen := range lengths { + if len(buf) < int(msgLen) { + trunc = int(numParts) - idx + return + } + + // Extract the slice, seek past on the buffer + slice := buf[:msgLen] + buf = buf[msgLen:] + parts = append(parts, slice) + } + return +} + +// Given a string of the form "host", "host:port", +// "ipv6::addr" or "[ipv6::address]:port", +// return true if the string includes a port. 
+func hasPort(s string) bool { + last := strings.LastIndex(s, ":") + if last == -1 { + return false + } + if s[0] == '[' { + return s[last-1] == ']' + } + return strings.Index(s, ":") == last +} + +// compressPayload takes an opaque input buffer, compresses it +// and wraps it in a compress{} message that is encoded. +func compressPayload(inp []byte) (*bytes.Buffer, error) { + var buf bytes.Buffer + compressor := lzw.NewWriter(&buf, lzw.LSB, lzwLitWidth) + + _, err := compressor.Write(inp) + if err != nil { + return nil, err + } + + // Ensure we flush everything out + if err := compressor.Close(); err != nil { + return nil, err + } + + // Create a compressed message + c := compress{ + Algo: lzwAlgo, + Buf: buf.Bytes(), + } + return encode(compressMsg, &c) +} + +// decompressPayload is used to unpack an encoded compress{} +// message and return its payload uncompressed +func decompressPayload(msg []byte) ([]byte, error) { + // Decode the message + var c compress + if err := decode(msg, &c); err != nil { + return nil, err + } + return decompressBuffer(&c) +} + +// decompressBuffer is used to decompress the buffer of +// a single compress message, handling multiple algorithms +func decompressBuffer(c *compress) ([]byte, error) { + // Verify the algorithm + if c.Algo != lzwAlgo { + return nil, fmt.Errorf("Cannot decompress unknown algorithm %d", c.Algo) + } + + // Create a uncompressor + uncomp := lzw.NewReader(bytes.NewReader(c.Buf), lzw.LSB, lzwLitWidth) + defer uncomp.Close() + + // Read all the data + var b bytes.Buffer + _, err := io.Copy(&b, uncomp) + if err != nil { + return nil, err + } + + // Return the uncompressed bytes + return b.Bytes(), nil +} + +// joinHostPort returns the host:port form of an address, for use with a +// transport. 
func joinHostPort(host string, port uint16) string {
	return net.JoinHostPort(host, strconv.Itoa(int(port)))
}
diff --git a/vendor/github.com/hashicorp/memberlist/util_test.go b/vendor/github.com/hashicorp/memberlist/util_test.go
new file mode 100644
index 000000000..e1d8eba01
--- /dev/null
+++ b/vendor/github.com/hashicorp/memberlist/util_test.go
@@ -0,0 +1,351 @@
package memberlist

import (
	"fmt"
	"reflect"
	"testing"
	"time"
)

// Test_hasPort covers bare hosts, host:port pairs, and IPv6 addresses with
// and without brackets/ports.
func Test_hasPort(t *testing.T) {
	cases := []struct {
		s        string
		expected bool
	}{
		{"", false},
		{":80", true},
		{"127.0.0.1", false},
		{"127.0.0.1:80", true},
		{"::1", false},
		{"2001:db8:a0b:12f0::1", false},
		{"[2001:db8:a0b:12f0::1]", false},
		{"[2001:db8:a0b:12f0::1]:80", true},
	}
	for _, c := range cases {
		if hasPort(c.s) != c.expected {
			t.Fatalf("bad: '%s' hasPort was not %v", c.s, c.expected)
		}
	}
}

// TestEncodeDecode round-trips a ping message; the [1:] skips the
// leading message-type byte that encode prepends.
func TestEncodeDecode(t *testing.T) {
	msg := &ping{SeqNo: 100}
	buf, err := encode(pingMsg, msg)
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}
	var out ping
	if err := decode(buf.Bytes()[1:], &out); err != nil {
		t.Fatalf("unexpected err: %s", err)
	}
	if msg.SeqNo != out.SeqNo {
		t.Fatalf("bad sequence no")
	}
}

// TestRandomOffset draws 100 offsets from a 2^31 range and expects no
// collisions (collision probability is negligible at this scale).
func TestRandomOffset(t *testing.T) {
	vals := make(map[int]struct{})
	for i := 0; i < 100; i++ {
		offset := randomOffset(2 << 30)
		if _, ok := vals[offset]; ok {
			t.Fatalf("got collision")
		}
		vals[offset] = struct{}{}
	}
}

// TestRandomOffset_Zero checks the n == 0 edge case returns 0 rather
// than panicking.
func TestRandomOffset_Zero(t *testing.T) {
	offset := randomOffset(0)
	if offset != 0 {
		t.Fatalf("bad offset")
	}
}

// TestSuspicionTimeout checks the log-scaled suspicion timeout at several
// cluster sizes (divided by the multiplier 3 to compare the base value).
func TestSuspicionTimeout(t *testing.T) {
	timeouts := map[int]time.Duration{
		5:    1000 * time.Millisecond,
		10:   1000 * time.Millisecond,
		50:   1698 * time.Millisecond,
		100:  2000 * time.Millisecond,
		500:  2698 * time.Millisecond,
		1000: 3000 * time.Millisecond,
	}
	for n, expected := range timeouts {
		timeout := suspicionTimeout(3, n, time.Second) / 3
		if timeout != expected {
			t.Fatalf("bad: %v, %v", expected, timeout)
		}
	}
}

// TestRetransmitLimit checks the retransmit limit at n = 0, 1, and 99.
func TestRetransmitLimit(t *testing.T) {
	lim := retransmitLimit(3, 0)
	if lim != 0 {
		t.Fatalf("bad val %v", lim)
	}
	lim = retransmitLimit(3, 1)
	if lim != 3 {
		t.Fatalf("bad val %v", lim)
	}
	lim = retransmitLimit(3, 99)
	if lim != 6 {
		t.Fatalf("bad val %v", lim)
	}
}

// TestShuffleNodes verifies shuffleNodes permutes the slice in place;
// with 8 nodes an identical permutation is vanishingly unlikely.
func TestShuffleNodes(t *testing.T) {
	orig := []*nodeState{
		&nodeState{
			State: stateDead,
		},
		&nodeState{
			State: stateAlive,
		},
		&nodeState{
			State: stateAlive,
		},
		&nodeState{
			State: stateDead,
		},
		&nodeState{
			State: stateAlive,
		},
		&nodeState{
			State: stateAlive,
		},
		&nodeState{
			State: stateDead,
		},
		&nodeState{
			State: stateAlive,
		},
	}
	nodes := make([]*nodeState, len(orig))
	copy(nodes[:], orig[:])

	if !reflect.DeepEqual(nodes, orig) {
		t.Fatalf("should match")
	}

	shuffleNodes(nodes)

	if reflect.DeepEqual(nodes, orig) {
		t.Fatalf("should not match")
	}
}

// TestPushPullScale checks the interval is unscaled up to the threshold
// (32) and then doubles/triples across the next log2 bands.
func TestPushPullScale(t *testing.T) {
	sec := time.Second
	for i := 0; i <= 32; i++ {
		if s := pushPullScale(sec, i); s != sec {
			t.Fatalf("Bad time scale: %v", s)
		}
	}
	for i := 33; i <= 64; i++ {
		if s := pushPullScale(sec, i); s != 2*sec {
			t.Fatalf("Bad time scale: %v", s)
		}
	}
	for i := 65; i <= 128; i++ {
		if s := pushPullScale(sec, i); s != 3*sec {
			t.Fatalf("Bad time scale: %v", s)
		}
	}
}

// TestMoveDeadNodes verifies long-dead nodes are swapped to the tail while
// alive nodes and recently-dead nodes stay in the head partition.
func TestMoveDeadNodes(t *testing.T) {
	nodes := []*nodeState{
		&nodeState{
			State:       stateDead,
			StateChange: time.Now().Add(-20 * time.Second),
		},
		&nodeState{
			State:       stateAlive,
			StateChange: time.Now().Add(-20 * time.Second),
		},
		// This dead node should not be moved, as its state changed
		// less than the specified GossipToTheDead time ago
		&nodeState{
			State:       stateDead,
			StateChange: time.Now().Add(-10 * time.Second),
		},
		&nodeState{
			State:       stateAlive,
			StateChange: time.Now().Add(-20 * time.Second),
		},
		&nodeState{
			State:       stateDead,
			StateChange: time.Now().Add(-20 * time.Second),
		},
		&nodeState{
			State:       stateAlive,
			StateChange: time.Now().Add(-20 * time.Second),
		},
	}

	idx := moveDeadNodes(nodes, (15 * time.Second))
	if idx != 4 {
		t.Fatalf("bad index")
	}
	for i := 0; i < idx; i++ {
		switch i {
		case 2:
			// Recently dead node remains at index 2,
			// since nodes are swapped out to move to end.
			if nodes[i].State != stateDead {
				t.Fatalf("Bad state %d", i)
			}
		default:
			if nodes[i].State != stateAlive {
				t.Fatalf("Bad state %d", i)
			}
		}
	}
	for i := idx; i < len(nodes); i++ {
		if nodes[i].State != stateDead {
			t.Fatalf("Bad state %d", i)
		}
	}
}

// TestKRandomNodes checks that selections honor the filter (no "test0",
// only alive nodes), return exactly k nodes, and differ between draws.
func TestKRandomNodes(t *testing.T) {
	nodes := []*nodeState{}
	for i := 0; i < 90; i++ {
		// Half the nodes are in a bad state
		state := stateAlive
		switch i % 3 {
		case 0:
			state = stateAlive
		case 1:
			state = stateSuspect
		case 2:
			state = stateDead
		}
		nodes = append(nodes, &nodeState{
			Node: Node{
				Name: fmt.Sprintf("test%d", i),
			},
			State: state,
		})
	}

	filterFunc := func(n *nodeState) bool {
		if n.Name == "test0" || n.State != stateAlive {
			return true
		}
		return false
	}

	s1 := kRandomNodes(3, nodes, filterFunc)
	s2 := kRandomNodes(3, nodes, filterFunc)
	s3 := kRandomNodes(3, nodes, filterFunc)

	if reflect.DeepEqual(s1, s2) {
		t.Fatalf("unexpected equal")
	}
	if reflect.DeepEqual(s1, s3) {
		t.Fatalf("unexpected equal")
	}
	if reflect.DeepEqual(s2, s3) {
		t.Fatalf("unexpected equal")
	}

	for _, s := range [][]*nodeState{s1, s2, s3} {
		if len(s) != 3 {
			t.Fatalf("bad len")
		}
		for _, n := range s {
			if n.Name == "test0" {
				t.Fatalf("Bad name")
			}
			if n.State != stateAlive {
				t.Fatalf("Bad state")
			}
		}
	}
}

// TestMakeCompoundMessage checks the compound framing overhead:
// header byte(s) plus a per-message length prefix plus the payloads.
func TestMakeCompoundMessage(t *testing.T) {
	msg := &ping{SeqNo: 100}
	buf, err := encode(pingMsg, msg)
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}

	msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}
	compound := makeCompoundMessage(msgs)

	if compound.Len() != 3*buf.Len()+3*compoundOverhead+compoundHeaderOverhead {
		t.Fatalf("bad len")
	}
}

// TestDecodeCompoundMessage round-trips a 3-part compound message; the
// [1:] skips the compoundMsg type byte.
func TestDecodeCompoundMessage(t *testing.T) {
	msg := &ping{SeqNo: 100}
	buf, err := encode(pingMsg, msg)
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}

	msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}
	compound := makeCompoundMessage(msgs)

	trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:])
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}
	if trunc != 0 {
		t.Fatalf("should not truncate")
	}
	if len(parts) != 3 {
		t.Fatalf("bad parts")
	}
	for _, p := range parts {
		if len(p) != buf.Len() {
			t.Fatalf("bad part len")
		}
	}
}

// TestDecodeCompoundMessage_Trunc feeds a compound message cut at byte 38
// and expects exactly one truncated part reported.
func TestDecodeCompoundMessage_Trunc(t *testing.T) {
	msg := &ping{SeqNo: 100}
	buf, err := encode(pingMsg, msg)
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}

	msgs := [][]byte{buf.Bytes(), buf.Bytes(), buf.Bytes()}
	compound := makeCompoundMessage(msgs)

	trunc, parts, err := decodeCompoundMessage(compound.Bytes()[1:38])
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}
	if trunc != 1 {
		t.Fatalf("truncate: %d", trunc)
	}
	if len(parts) != 2 {
		t.Fatalf("bad parts")
	}
	for _, p := range parts {
		if len(p) != buf.Len() {
			t.Fatalf("bad part len")
		}
	}
}

// TestCompressDecompressPayload round-trips a payload through the LZW
// compress/decompress pair; the [1:] skips the compressMsg type byte.
func TestCompressDecompressPayload(t *testing.T) {
	buf, err := compressPayload([]byte("testing"))
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}

	decomp, err := decompressPayload(buf.Bytes()[1:])
	if err != nil {
		t.Fatalf("unexpected err: %s", err)
	}

	if !reflect.DeepEqual(decomp, []byte("testing")) {
		t.Fatalf("bad payload: %v", decomp)
	}
}