etcd: ssh: Reconnect on SSH failures

If the SSH-tunnelled connection to a seed dies, the etcd dialer now opens a new tunnel each time it reconnects, instead of reusing the dead one.
This commit is contained in:
James Shubin
2025-06-02 15:08:37 -04:00
parent f42daf4509
commit 81063ae6df

View File

@@ -254,14 +254,23 @@ func (obj *World) Connect(ctx context.Context, init *engine.WorldInit) error {
return e return e
}) })
tunnels := make(map[string]net.Conn) // This runs repeatedly when etcd tries to reconnect.
for _, seed := range obj.Seeds { grpcWithContextDialerFunc := func(ctx context.Context, addr string) (net.Conn, error) {
addr := seedSSH[seed] var reterr error
for _, seed := range obj.Seeds { // first successful connect wins
if addr != seedSSH[seed] {
continue // not what we're expecting
}
obj.init.Logf("tunnel: %s", addr) obj.init.Logf("tunnel: %s", addr)
tunnel, err := obj.sshClient.Dial("tcp", addr) tunnel, err := obj.sshClient.Dial("tcp", addr)
if err != nil { if err != nil {
return errwrap.Append(obj.cleanup(), err) reterr = err
obj.init.Logf("ssh dial error: %v", err)
continue
} }
// TODO: do we need a mutex around adding these?
obj.cleanups = append(obj.cleanups, func() error { obj.cleanups = append(obj.cleanups, func() error {
e := tunnel.Close() e := tunnel.Close()
if e == io.EOF { // XXX: why does this happen? if e == io.EOF { // XXX: why does this happen?
@@ -272,22 +281,20 @@ func (obj *World) Connect(ctx context.Context, init *engine.WorldInit) error {
} }
return e return e
}) })
tunnels[addr] = tunnel
return tunnel, nil // connected successfully
}
if reterr != nil {
return nil, reterr
}
return nil, fmt.Errorf("no ssh tunnels available") // TODO: better error message?
} }
etcdClient, err := clientv3.New(clientv3.Config{ etcdClient, err := clientv3.New(clientv3.Config{
Endpoints: obj.Seeds, Endpoints: obj.Seeds,
DialOptions: []grpc.DialOption{ DialOptions: []grpc.DialOption{
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) { grpc.WithContextDialer(grpcWithContextDialerFunc),
tunnel, exists := tunnels[addr]
if !exists {
obj.init.Logf("can't find tunnel: %s", addr) // tell user early...
return nil, fmt.Errorf("can't find tunnel: %s", addr)
}
// TODO: print the scheme here on this log msg
obj.init.Logf("etcd dial: %s", addr)
return tunnel, nil
}),
}, },
}) })
if err != nil { if err != nil {