diff --git a/etcd.go b/etcd.go index 1a763869..4ec685b8 100644 --- a/etcd.go +++ b/etcd.go @@ -1241,7 +1241,19 @@ func (obj *EmbdEtcd) volunteerCallback(re *RE) error { if removed { log.Printf("Etcd: Member Removed (forced): %v(%v)", quitter, mID) } - return &CtxDelayErr{1 * time.Second, fmt.Sprintf("Member %s (%d) removed successfully!", quitter, mID)} // retry asap + + // Remove the endpoint from our list to avoid blocking + // future MemberList calls which would try and connect + // to a missing endpoint... The endpoints should get + // updated from the member exiting safely if it doesn't + // crash, but if it did and/or since it's a race to see + // if the update event will get seen before we need the + // new data, just do it now anyways, then update the + // endpoint list and trigger a reconnect. + delete(obj.endpoints, quitter) // proactively delete it + obj.endpointCallback(nil) // update! + log.Printf("Member %s (%d) removed successfully!", quitter, mID) + return &CtxReconnectErr{"a member was removed"} // retry asap and update endpoint list } else { // programming error @@ -1852,7 +1864,10 @@ func EtcdMembers(obj *EmbdEtcd) (map[uint64]string, error) { return nil, fmt.Errorf("Exiting...") } obj.rLock.RLock() - response, err = obj.client.MemberList(context.Background()) + if TRACE { + log.Printf("Trace: Etcd: EtcdMembers(): Endpoints are: %v", obj.client.Endpoints()) + } + response, err = obj.client.MemberList(ctx) obj.rLock.RUnlock() if err == nil { break diff --git a/test/shell/t8.sh b/test/shell/t8.sh new file mode 100755 index 00000000..ed2da33f --- /dev/null +++ b/test/shell/t8.sh @@ -0,0 +1,22 @@ +#!/bin/bash -e + +# run empty graphs, we're just testing etcd clustering +timeout --kill-after=120s 90s ./mgmt run --hostname h1 & +pid1=$! +sleep 5s # let it startup + +timeout --kill-after=120s 90s ./mgmt run --hostname h2 --seeds http://127.0.0.1:2379 --client-urls http://127.0.0.1:2381 --server-urls http://127.0.0.1:2382 & +pid2=$! +sleep 5s + +$(sleep 5s && kill -SIGINT $pid2)& # send ^C to exit 2nd mgmt +wait $pid2 +e=$? +if [ $e -ne 0 ]; then + exit $e +fi + +$(sleep 5s && kill -SIGINT $pid1)& # send ^C to exit 1st mgmt +wait $pid1 # get exit status +# if pid1 exits because of a timeout, then it blocked, and this is a bug! +exit $?