Remote "agent-less" mode

This is a new mode to be used for bootstrapping mgmt clusters or in situations with tight operational restrictions. This includes the basics, additional functionality will follow!
2016-08-03 05:00:40 -04:00
parent bdb970203c
commit 7032eea045
7 changed files with 977 additions and 3 deletions
--- a/DOCUMENTATION.md
+++ b/DOCUMENTATION.md
@@ -34,6 +34,7 @@ along with this program.  If not, see <http://www.gnu.org/licenses/>.
 	* [Autoedges - Automatic resource relationships](#autoedges)
 	* [Autogrouping - Automatic resource grouping](#autogrouping)
 	* [Automatic clustering - Automatic cluster management](#automatic-clustering)
+	* [Remote mode - Remote "agent-less" execution](#remote-agent-less-mode)
 5. [Usage/FAQ - Notes on usage and frequently asked questions](#usage-and-frequently-asked-questions)
 6. [Reference - Detailed reference](#reference)
 	* [Graph definition file](#graph-definition-file)
@@ -142,6 +143,27 @@ with the `--seeds` variable.
 You can read the introductory blog post about this topic here:
 [https://ttboj.wordpress.com/2016/06/20/automatic-clustering-in-mgmt/](https://ttboj.wordpress.com/2016/06/20/automatic-clustering-in-mgmt/)

+###Remote ("agent-less") mode
+
+Remote mode is a special mode that lets you kick off mgmt runs on one or more
+remote machines which are only accessible via SSH. In this mode the initiating
+host connects over SSH, copies over the `mgmt` binary, opens an SSH tunnel, and
+runs the remote program while simultaneously passing the etcd traffic back
+through the tunnel so that the initiator's etcd cluster can be used to exchange
+resource data.
+
+The interesting benefit of this architecture is that multiple hosts which can't
+connect directly can use the initiator to pass the important traffic through to
+each other. Once the cluster has converged, all the remote programs can shut
+down, leaving no residual agent.
+
+This mode can also be useful for bootstrapping a new host where you'd like to
+have the service run continuously and as part of an mgmt cluster normally.
+
+####Blog post
+
+An introductory blog post about this topic will follow soon.
+
 ##Usage and frequently asked questions
 (Send your questions as a patch to this FAQ! I'll review it, merge it, and
 respond by commit with the answer.)
@@ -222,6 +244,11 @@ Globally force all resources into no-op mode. This also disables the export to
 etcd functionality, but does not disable resource collection, however all
 resources that are collected will have their individual noop settings set.

+####`--remote <graph.yaml>`
+Point to a graph file to run on the remote host specified within. This parameter
+can be used multiple times if you'd like to remotely run on multiple hosts in
+parallel.
+
 ##Examples
 For example configurations, please consult the [examples/](https://github.com/purpleidea/mgmt/tree/master/examples) directory in the git
 source repository. It is available from:
--- a/config.go
+++ b/config.go
@@ -56,6 +56,7 @@ type GraphConfig struct {
 	Collector []collectorResConfig `yaml:"collect"`
 	Edges     []edgeConfig         `yaml:"edges"`
 	Comment   string               `yaml:"comment"`
+	Remote    string               `yaml:"remote"`
 }

 func (c *GraphConfig) Parse(data []byte) error {
--- a/etcd.go
+++ b/etcd.go
@@ -77,6 +77,8 @@ const (
 	exitDelay               = 3                // number of sec of inactivity after exit to clean up
 	defaultIdealClusterSize = 5                // default ideal cluster size target for initial seed
 	tempPrefix              = "tmp-mgmt-etcd-" // XXX use some special mgmt tmp dir
+	DefaultClientURL        = "127.0.0.1:2379"
+	DefaultServerURL        = "127.0.0.1:2380"
 )

 var (
@@ -214,7 +216,7 @@ func NewEmbdEtcd(hostname string, seeds, clientURLs, serverURLs etcdtypes.URLs,
 	// TODO: add some sort of auto assign method for picking these defaults
 	// add a default so that our local client can connect locally if needed
 	if len(obj.LocalhostClientURLs()) == 0 { // if we don't have any localhost URLs
-		u := url.URL{Scheme: "http", Host: "127.0.0.1:2379"}     // default
+		u := url.URL{Scheme: "http", Host: DefaultClientURL}     // default
 		obj.clientURLs = append([]url.URL{u}, obj.clientURLs...) // prepend
 	}

@@ -223,7 +225,7 @@ func NewEmbdEtcd(hostname string, seeds, clientURLs, serverURLs etcdtypes.URLs,
 		if len(obj.endpoints) > 0 {
 			obj.noServer = true // we didn't have enough to be a server
 		}
-		u := url.URL{Scheme: "http", Host: "127.0.0.1:2380"} // default
+		u := url.URL{Scheme: "http", Host: DefaultServerURL} // default
 		obj.serverURLs = []url.URL{u}
 	}

--- a/examples/remote1.yaml
+++ b/examples/remote1.yaml
@@ -0,0 +1,23 @@
+---
+graph: mygraph
+comment: remote noop example
+resources:
+  noop:
+  - name: noop1
+    meta:
+      noop: true
+  file:
+  - name: file1
+    path: "/tmp/mgmt-remote-hello"
+    content: |
+      hello world from @purpleidea
+    state: exists
+edges:
+- name: e1
+  from:
+    kind: noop
+    name: noop1
+  to:
+    kind: file
+    name: file1
+remote: "ssh://root:password@hostname:22"
--- a/main.go
+++ b/main.go
@@ -104,6 +104,21 @@ func run(c *cli.Context) error {
 		return cli.NewExitError("", 1)
 	}

+	if c.Bool("no-server") && len(c.StringSlice("remote")) > 0 {
+		// TODO: in this case, we won't be able to tunnel stuff back to
+		// here, so if we're okay with every remote graph running in an
+		// isolated mode, then this is okay. Improve on this if there's
+		// someone who really wants to be able to do this.
+		log.Println("Main: Error: the --no-server and --remote parameters cannot be used together!")
+		return cli.NewExitError("", 1)
+	}
+
+	// validate the value as a signed int *before* narrowing it: a uint16 can
+	// never be negative, so a check after the conversion is dead code and a
+	// value such as -1 would silently wrap around to 65535 instead
+	cConnsInt := c.Int("cconns")
+	if cConnsInt < 0 {
+		log.Printf("Main: Error: --cconns should be at least zero!")
+		return cli.NewExitError("", 1)
+	}
+	if cConnsInt > 65535 { // largest value that fits in the uint16 below
+		log.Printf("Main: Error: --cconns should be at most 65535!")
+		return cli.NewExitError("", 1)
+	}
+	cConns := uint16(cConnsInt)
+
 	var wg sync.WaitGroup
 	exit := make(chan bool) // exit signal
 	var G, fullGraph *Graph
@@ -244,6 +259,21 @@ func run(c *cli.Context) error {
 		}
 	}()

+	// build remotes struct for remote ssh
+	remotes := NewRemotes(
+		EmbdEtcd.LocalhostClientURLs().StringSlice(),
+		[]string{DefaultClientURL},
+		noop,
+		c.StringSlice("remote"), // list of files
+		cConns,
+		c.Bool("allow-interactive"),
+		c.String("ssh-priv-id-rsa"),
+	)
+
+	// TODO: is there any benefit to running the remotes above in the loop?
+	// wait for etcd to be running before we remote in, which we do above!
+	go remotes.Run()
+
 	if !c.IsSet("file") && !c.IsSet("puppet") {
 		converger.Start() // better start this for empty graphs
 	}
@@ -253,6 +283,8 @@ func run(c *cli.Context) error {

 	log.Println("Destroy...")

+	remotes.Exit() // tell all the remote connections to shutdown; waits!
+
 	G.Exit() // tell all the children to exit

 	// tell inner main loop to exit
@@ -398,6 +430,27 @@ func main() {
 					Value: "",
 					Usage: "supply the path to an alternate puppet.conf file to use",
 				},
+				cli.StringSliceFlag{
+					Name:  "remote",
+					Value: &cli.StringSlice{},
+					Usage: "list of remote graph definitions to run",
+				},
+				cli.BoolFlag{
+					Name:  "allow-interactive",
+					Usage: "allow interactive prompting, such as for remote passwords",
+				},
+				cli.StringFlag{
+					Name:   "ssh-priv-id-rsa",
+					Value:  "~/.ssh/id_rsa",
+					Usage:  "default path to ssh key file, set empty to never touch",
+					EnvVar: "MGMT_SSH_PRIV_ID_RSA",
+				},
+				cli.IntFlag{
+					Name:   "cconns",
+					Value:  0,
+					Usage:  "number of maximum concurrent remote ssh connections to run, 0 for unlimited",
+					EnvVar: "MGMT_CCONNS",
+				},
 			},
 		},
 	}
--- a/remote.go
+++ b/remote.go
@@ -0,0 +1,868 @@
+// Mgmt
+// Copyright (C) 2013-2016+ James Shubin and the project contributors
+// Written by James Shubin <james@shubin.ca> and the project contributors
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+// This set of structs and methods is for running mgmt remotely over SSH. This
+// gives us the architectural robustness of our current design, combined with
+// the ability to run it with an "agent-less" approach for bootstrapping, and
+// in environments with more restrictive installation requirements. In general
+// the following sequence is run:
+//
+//	1) connect to remote host
+//	2) make temporary directory
+//	3) copy over the mgmt binary and graph definition
+//	4) tunnel tcp connections for etcd
+//	5) run it!
+//	6) finish and quit
+//	7) close tunnels
+//	8) clean up
+//	9) disconnect
+//
+// The main advantage of this agent-less approach is that while several of these
+// transient remote mgmt agents are running, they can still exchange data and
+// converge together without directly connecting, since they all tunnel through
+// the etcd server running on the initiator.
+package main // TODO: make this a separate ssh package
+
+// TODO: running with two identical remote endpoints over a slow connection, eg:
+// --remote file1.yaml --remote file1.yaml
+// where we ^C when both file copies are running seems to deadlock the process.
+
+import (
+	"bytes"
+	"fmt"
+	"github.com/howeyc/gopass"
+	"github.com/kardianos/osext"
+	"github.com/pkg/sftp"
+	"golang.org/x/crypto/ssh"
+	"io"
+	"io/ioutil"
+	"log"
+	"math/rand"
+	"net"
+	"net/url"
+	"os"
+	"os/user"
+	"path"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+)
+
+const (
+	// FIXME: should this dir be in /var/ instead?
+	formatPattern                        = "/tmp/mgmt.%s/"                        // remote format, to match `mktemp`
+	formatChars                          = "abcdefghijklmnopqrstuvwxyz0123456789" // chars for fmt string // TODO: what does mktemp use?
+	maxCollisions                        = 13                                     // number of tries to try making a unique remote directory
+	defaultUser                          = "mgmt"                                 // default user
+	defaultPort                   uint16 = 22                                     // default port
+	maxPasswordTries                     = 3                                      // max number of interactive password tries
+	nonInteractivePasswordTimeout        = 5 * 2                                  // NOTE(review): evaluates to 10 (presumably seconds, as passed to TimeAfterOrBlock); this was previously described as "five minutes" -- confirm the intended value
+)
+
+// The SSH struct is the unit building block for a single remote SSH connection.
+// It holds the connection parameters, the handles which are opened during the
+// different stages (connect, sftp, tunnel, exec) and the remote paths which
+// are created by the Sftp stage.
+type SSH struct {
+	host string           // remote host to connect to
+	port uint16           // remote port to connect to (usually 22)
+	user string           // username to connect with
+	auth []ssh.AuthMethod // list of auth for ssh
+
+	file       string   // the graph definition file to run
+	clientURLs []string // list of urls where the local server is listening
+	remoteURLs []string // list of urls where the remote server connects to
+	noop       bool     // whether to run the remote process with --noop
+
+	// NOTE: f1 and f2 are only assigned inside SftpCopy, so they are nil
+	// until a copy has been started.
+	client   *ssh.Client  // client object
+	sftp     *sftp.Client // sftp object
+	listener net.Listener // remote listener
+	session  *ssh.Session // session for exec
+	f1       *os.File     // file object for SftpCopy source
+	f2       *sftp.File   // file object for SftpCopy destination
+
+	wg      sync.WaitGroup // sync group for tunnel go routines
+	lock    sync.Mutex     // mutex to avoid exit races
+	exiting bool           // flag to let us know if we're exiting (set by Stop, read by exitCheck)
+
+	remotewd string // path to remote working directory
+	execpath string // path to remote mgmt binary
+	filepath string // path to remote file config
+}
+
+// Connect kicks off the SSH connection.
+func (obj *SSH) Connect() error {
+	config := &ssh.ClientConfig{
+		User: obj.user,
+		// you must pass in at least one implementation of AuthMethod
+		Auth: obj.auth,
+	}
+	var err error
+	obj.client, err = ssh.Dial("tcp", fmt.Sprintf("%s:%d", obj.host, obj.port), config)
+	if err != nil {
+		return fmt.Errorf("Can't dial: %s", err.Error()) // Error() returns a string
+	}
+	return nil
+}
+
+// Close shuts down the main SSH client connection, if one was ever stored. It
+// is a no-op which returns nil when there is no client.
+func (obj *SSH) Close() error {
+	if c := obj.client; c != nil {
+		return c.Close()
+	}
+	return nil
+}
+
+// The Sftp function uses the sftp protocol to create a remote temporary dir and
+// then copies the running mgmt binary and the graph definition file into it.
+// The remote paths which get used are stored in obj.remotewd, obj.execpath and
+// obj.filepath. A nil return value does not always mean that everything was
+// copied: if a stop was requested part way through, this returns early with
+// nil and leaves the remaining steps undone.
+func (obj *SSH) Sftp() error {
+	var err error
+
+	if obj.client == nil {
+		return fmt.Errorf("Not dialed!")
+	}
+	// this check is needed because the golang path.Base function is weird!
+	if strings.HasSuffix(obj.file, "/") {
+		return fmt.Errorf("File must not be a directory.")
+	}
+
+	// we run local operations first so that remote clean up is easier...
+	selfpath := ""
+	if selfpath, err = osext.Executable(); err != nil {
+		return fmt.Errorf("Can't get executable path: %v", err)
+	}
+	log.Printf("Remote: Self executable is: %s", selfpath)
+
+	// this calls NewSession and does everything in its own session :)
+	obj.sftp, err = sftp.NewClient(obj.client)
+	if err != nil {
+		return err
+	}
+
+	// TODO: make the path configurable to deal with /tmp/ mounted noexec?
+	obj.remotewd = ""
+	for i := 0; true; {
+		// NOTE: since fmtUUID is deterministic, if we don't clean up
+		// previous runs, we may get the same paths generated, and here
+		// they will conflict.
+		obj.remotewd = fmt.Sprintf(formatPattern, fmtUUID(10)) // eg: /tmp/mgmt.abcdefghij/
+		if err := obj.sftp.Mkdir(obj.remotewd); err != nil {
+			i++ // count number of times we've tried
+			e := fmt.Errorf("Can't make tmp directory: %s", err)
+			log.Println(e)
+			if i >= maxCollisions {
+				log.Printf("Remote: Please clean up the remote dir: %s", obj.remotewd)
+				return e
+			}
+			continue // try again, unlucky conflict!
+		}
+		log.Printf("Remote: Remotely created: %s", obj.remotewd)
+		break
+	}
+
+	// FIXME: consider running a hashing function to check if the remote file
+	// is valid before copying it over again... this would need a deterministic
+	// temp directory location first... this actually happens with fmtUUID!
+	// future patch!
+
+	obj.execpath = path.Join(obj.remotewd, program) // program is a compile time string from main.go
+	log.Printf("Remote: Remote path is: %s", obj.execpath)
+
+	log.Println("Remote: Copying binary, please be patient...")
+	_, err = obj.SftpCopy(selfpath, obj.execpath)
+	if err != nil {
+		// TODO: cleanup
+		return fmt.Errorf("Error copying binary: %s", err)
+	}
+
+	// SftpCopy returns a nil error when it bails out because of a stop
+	// request, so check for that before touching the remote file again
+	if obj.exitCheck() {
+		return nil
+	}
+
+	// make file executable
+	// TODO: do we want the group or other bits set?
+	if err := obj.sftp.Chmod(obj.execpath, 0770); err != nil {
+		// include the cause, it used to be silently dropped here
+		return fmt.Errorf("Can't set file mode bits: %v", err)
+	}
+
+	// copy graph file
+	// TODO: should future versions use torrent for this copy and updates?
+	obj.filepath = path.Join(obj.remotewd, path.Base(obj.file)) // same filename
+	log.Println("Remote: Copying graph definition...")
+	_, err = obj.SftpCopy(obj.file, obj.filepath)
+	if err != nil {
+		// TODO: cleanup
+		return fmt.Errorf("Error copying graph: %s", err)
+	}
+
+	return nil
+}
+
+// SftpCopy is a simple helper function that runs a local -> remote sftp copy.
+// It opens src locally and creates dst over sftp, storing both handles on the
+// struct (obj.f1 and obj.f2) so that SftpClean can close them to interrupt a
+// copy that is in progress. It returns the number of bytes copied. If a stop
+// request is detected before the copy starts, it returns (-1, nil), so callers
+// should run their own exit check after a nil error. Copying zero bytes is
+// treated as an error.
+func (obj *SSH) SftpCopy(src, dst string) (int64, error) {
+	if obj.sftp == nil {
+		return -1, fmt.Errorf("Sftp session is not active!")
+	}
+	var err error
+	// TODO: add a check to make sure we don't run two copies of this
+	// function at the same time! they both would use obj.f1 and obj.f2
+
+	obj.f1, err = os.Open(src) // open a handle to read the file
+	if err != nil {
+		return -1, err
+	}
+	defer obj.f1.Close()
+
+	if obj.exitCheck() {
+		return -1, nil // stop requested: not an error, but nothing was copied
+	}
+
+	obj.f2, err = obj.sftp.Create(dst) // open a handle to create the file
+	if err != nil {
+		return -1, err
+	}
+	defer obj.f2.Close()
+
+	if obj.exitCheck() {
+		return -1, nil // stop requested: not an error, but nothing was copied
+	}
+
+	// the actual copy, this might take time...
+	n, err := io.Copy(obj.f2, obj.f1) // dst, src -> n, error
+	if err != nil {
+		return n, fmt.Errorf("Can't copy to remote path: %v", err)
+	}
+	if n <= 0 {
+		return n, fmt.Errorf("Zero bytes copied!")
+	}
+	return n, nil
+}
+
+// SftpClean cleans up the mess and closes the connection from the sftp work. It
+// closes any copy handles that were opened, removes the remote graph file, the
+// remote binary and the remote working directory, and then closes the sftp
+// client. It is a no-op when no sftp client exists. Only the last error that
+// occurred is returned.
+func (obj *SSH) SftpClean() error {
+	if obj.sftp == nil {
+		return nil
+	}
+
+	// close any copy operations that are in progress... the handles are
+	// only set once SftpCopy has run, so guard against nil to avoid a panic
+	// when we get here before any copy was started. errors are ignored on
+	// purpose, since the handles may already have been closed by SftpCopy.
+	if obj.f1 != nil {
+		obj.f1.Close() // TODO: we probably only need to shutdown one of them,
+	}
+	if obj.f2 != nil {
+		obj.f2.Close() // but which one should we shutdown? close both for now
+	}
+
+	// clean up the graph definition in obj.remotewd
+	err := obj.sftp.Remove(obj.filepath)
+
+	// TODO: add binary caching
+	if e := obj.sftp.Remove(obj.execpath); e != nil {
+		err = e
+	}
+	if e := obj.sftp.Remove(obj.remotewd); e != nil {
+		err = e
+	}
+
+	if e := obj.sftp.Close(); e != nil {
+		err = e
+	}
+
+	// TODO: return all errors when we have a better error struct
+	return err
+}
+
+// Tunnel initiates the reverse SSH tunnel. It opens a listener on the remote
+// host (using the first remote URL) and starts a go routine which accepts
+// connections on it and forwards them to the local server. This function does
+// not block. The go routine is tracked in obj.wg, which TunnelClose waits on to
+// know when the tunnel has closed completely.
+func (obj *SSH) Tunnel() error {
+	var err error
+
+	if len(obj.clientURLs) < 1 {
+		return fmt.Errorf("Need at least one client URL to tunnel!")
+	}
+	if len(obj.remoteURLs) < 1 {
+		return fmt.Errorf("Need at least one remote URL to tunnel!")
+	}
+
+	// TODO: do something less arbitrary about which one we pick?
+	url := cleanURL(obj.remoteURLs[0]) // arbitrarily pick the first one
+	// reverse `ssh -R` listener to listen on the remote host
+	obj.listener, err = obj.client.Listen("tcp", url) // remote
+	if err != nil {
+		return fmt.Errorf("Can't listen on remote host: %s", err)
+	}
+
+	obj.wg.Add(1)
+	go func() {
+		defer obj.wg.Done()
+		for {
+			conn, err := obj.listener.Accept()
+			if err != nil {
+				// a Close() will trigger an EOF "error" here!
+				if err == io.EOF {
+					return
+				}
+				log.Printf("Remote: Error accepting on remote host: %s", err)
+				return // FIXME: return or continue?
+			}
+			// XXX: pass in wg to this method and to its children?
+			if f := obj.forward(conn); f != nil {
+				// TODO: is this correct?
+				// NOTE: this defer is inside the loop, so it only
+				// runs when this go routine returns, and not at
+				// the end of each iteration.
+				defer f.Close() // close the remote connection
+			} else {
+				// TODO: is this correct?
+				// close the listener since it is useless now
+				obj.listener.Close()
+			}
+		}
+	}()
+	return nil
+}
+
+// forward is a helper function to make the tunnelling code more readable. It
+// dials the first client URL locally, and then starts two go routines which
+// copy the data in each direction between the remote and the local connection.
+// It returns the local connection on success, or nil if the local dial failed.
+// The copy go routines are not waited on here.
+func (obj *SSH) forward(remoteConn net.Conn) net.Conn {
+	// TODO: validate URL format?
+	// TODO: do something less arbitrary about which one we pick?
+	url := cleanURL(obj.clientURLs[0])     // arbitrarily pick the first one
+	localConn, err := net.Dial("tcp", url) // local
+	if err != nil {
+		log.Printf("Remote: Local dial error: %s", err)
+		return nil // seen as an error...
+	}
+
+	cp := func(writer, reader net.Conn) {
+		// Copy copies from src to dst until either EOF is reached on
+		// src or an error occurs. It returns the number of bytes copied
+		// and the first error encountered while copying, if any.
+		// Note: src & dst are backwards in golang as compared to cp, lol!
+		n, err := io.Copy(writer, reader) // from reader to writer
+		if err != nil {
+			log.Printf("Remote: io.Copy error: %s", err)
+			// FIXME: what should we do here???
+		}
+		if DEBUG {
+			log.Printf("Remote: io.Copy finished: %d", n)
+		}
+	}
+	go cp(remoteConn, localConn) // local -> remote
+	go cp(localConn, remoteConn) // remote -> local
+
+	return localConn // success!
+}
+
+// TunnelClose causes any currently connected Tunnel to shutdown. It closes the
+// remote listener, waits for the tunnel go routines to finish, and returns the
+// error from closing the listener. It does nothing if no listener is active.
+func (obj *SSH) TunnelClose() error {
+	if obj.listener == nil {
+		return nil // nothing is listening
+	}
+	err := obj.listener.Close()
+	obj.wg.Wait() // wait for everyone to close
+	obj.listener = nil
+	return err
+}
+
+// Exec runs the binary on the remote server, and blocks until the remote
+// command has finished. The remote process is started with --no-server and is
+// seeded with the first remote URL, which is where the tunnel is listening.
+// The remote stdout and stderr are collected into a single buffer, which is
+// logged once the command is done. It returns an error if the paths from the
+// Sftp stage are missing, if the session can't be created, or if the command
+// fails or exits with a non-zero status.
+func (obj *SSH) Exec() error {
+	if obj.execpath == "" {
+		return fmt.Errorf("Must have a binary path to execute!")
+	}
+	if obj.filepath == "" {
+		return fmt.Errorf("Must have a graph definition to run!")
+	}
+
+	var err error
+	obj.session, err = obj.client.NewSession()
+	if err != nil {
+		return fmt.Errorf("Failed to create session: %s", err.Error())
+	}
+	defer obj.session.Close()
+
+	// both streams share one writer so that the output is interleaved
+	var b combinedWriter
+	obj.session.Stdout = &b
+	obj.session.Stderr = &b
+
+	// TODO: do something less arbitrary about which one we pick?
+	url := cleanURL(obj.remoteURLs[0])                           // arbitrarily pick the first one
+	seeds := fmt.Sprintf("--no-server --seeds 'http://%s'", url) // XXX: escape dangerous untrusted input?
+	file := fmt.Sprintf("--file '%s'", obj.filepath)             // XXX: escape dangerous untrusted input!
+	args := []string{seeds, file}
+	if obj.noop {
+		args = append(args, "--noop")
+	}
+
+	// TODO: add --converged-timeout support for group
+
+	cmd := fmt.Sprintf("%s run %s", obj.execpath, strings.Join(args, " "))
+	log.Printf("Remote: Running: %s", cmd)
+	if err := obj.session.Run(cmd); err != nil {
+		// The returned error is nil if the command runs, has no
+		// problems copying stdin, stdout, and stderr, and exits with a
+		// zero exit status. If the remote server does not send an exit
+		// status, an error of type *ExitMissingError is returned. If
+		// the command completes unsuccessfully or is interrupted by a
+		// signal, the error is of type *ExitError. Other error types
+		// may be returned for I/O problems.
+		if e, ok := err.(*ssh.ExitError); ok {
+			if sig := e.Waitmsg.Signal(); sig != "" {
+				log.Printf("Remote: Exit signal: %s", sig)
+			}
+			log.Printf("Remote: Error: Output...\n%s", b.PrefixedString("|\t"))
+			return fmt.Errorf("Exited (%d) with: %s", e.Waitmsg.ExitStatus(), e.Error())
+
+		} else if e, ok := err.(*ssh.ExitMissingError); ok {
+			return fmt.Errorf("Exit code missing: %s", e.Error())
+		}
+		// TODO: catch other types of errors here...
+		return fmt.Errorf("Failed for unknown reason: %s", err.Error())
+	}
+	log.Printf("Remote: Output...\n%s", b.PrefixedString("|\t"))
+	return nil
+}
+
+// simpleRun executes a single command on the remote host in a fresh session of
+// its own, which is closed again before returning. The main exec session is not
+// touched. The command output is not captured.
+func (obj *SSH) simpleRun(cmd string) error {
+	s, err := obj.client.NewSession() // not the main session!
+	if err != nil {
+		return fmt.Errorf("Failed to create session: %s", err.Error())
+	}
+	defer s.Close()
+
+	err = s.Run(cmd)
+	if err != nil {
+		return fmt.Errorf("Error running command: %s", err)
+	}
+	return nil
+}
+
+// ExecExit sends a SIGINT (^C) signal to the remote process, and waits for the
+// process to exit. It does nothing if no exec session was ever created. Since
+// the signal sent over the session is noted below as not working, `killall` is
+// additionally run remotely as a workaround, with a SIGKILL being sent from a
+// go routine ten seconds later. The wait is done by polling remotely with
+// `killall -0` until no matching process is left.
+// NOTE: the killall commands match on the program name, so they apply to every
+// process with that name on the remote host that the ssh user may signal.
+func (obj *SSH) ExecExit() error {
+	if obj.session == nil {
+		return nil
+	}
+	// Signal sends the given signal to the remote process.
+	// FIXME: this doesn't work, see: https://github.com/golang/go/issues/16597
+	// FIXME: additionally, a disconnect leaves the remote process running! :(
+	if err := obj.session.Signal(ssh.SIGINT); err != nil {
+		log.Printf("Remote: Signal: Error: %s", err)
+	}
+
+	// FIXME: workaround: force a signal!
+	if err := obj.simpleRun(fmt.Sprintf("killall -SIGINT %s", program)); err != nil { // FIXME: low specificity
+		log.Printf("Remote: Failed to send SIGINT: %s", err.Error())
+	}
+
+	// emergency timeout...
+	// NOTE: this go routine is not tracked or cancelled, so it still runs
+	// its command after ten seconds, even if the process exited cleanly.
+	go func() {
+		// try killing the process more violently
+		time.Sleep(10 * time.Second)
+		//obj.session.Signal(ssh.SIGKILL)
+		cmd := fmt.Sprintf("killall -SIGKILL %s", program) // FIXME: low specificity
+		obj.simpleRun(cmd)
+	}()
+
+	// FIXME: workaround: wait (spin lock) until process quits cleanly...
+	cmd := fmt.Sprintf("while killall -0 %s 2> /dev/null; do sleep 1s; done", program) // FIXME: low specificity
+	if err := obj.simpleRun(cmd); err != nil {
+		return fmt.Errorf("Error waiting: %s", err)
+	}
+
+	return nil
+}
+
+// Go kicks off the entire sequence of one SSH connection. It runs the connect,
+// sftp, tunnel and exec stages in that order, and blocks until the remote
+// command has finished or until a stage fails. Between each stage it checks if
+// a stop was requested, and if so it returns nil without starting the next
+// stage. The clean up for each stage is deferred, so it runs in reverse order
+// when this function returns.
+func (obj *SSH) Go() error {
+	if obj.exitCheck() {
+		return nil
+	}
+
+	// connect
+	log.Println("Remote: Connect...")
+	if err := obj.Connect(); err != nil {
+		return fmt.Errorf("Remote: SSH errored with: %v", err)
+	}
+	defer obj.Close()
+
+	if obj.exitCheck() {
+		return nil
+	}
+
+	// sftp
+	log.Println("Remote: Sftp...")
+	// the clean up is registered before the stage runs, so that a partially
+	// completed Sftp stage still gets cleaned up when it fails
+	defer obj.SftpClean()
+	if err := obj.Sftp(); err != nil {
+		return fmt.Errorf("Remote: Sftp errored with: %v", err)
+	}
+
+	if obj.exitCheck() {
+		return nil
+	}
+
+	// tunnel
+	log.Println("Remote: Tunnelling...")
+	if err := obj.Tunnel(); err != nil { // non-blocking
+		log.Printf("Remote: Tunnel errored with: %v", err)
+		return err
+	}
+	defer obj.TunnelClose()
+
+	if obj.exitCheck() {
+		return nil
+	}
+
+	// exec
+	log.Println("Remote: Exec...")
+	if err := obj.Exec(); err != nil { // blocks until the remote command is done
+		log.Printf("Remote: Exec errored with: %v", err)
+		return err
+	}
+
+	log.Println("Remote: Done!")
+	return nil
+}
+
+// exitCheck reports whether a Stop() has been requested. The stages use it to
+// avoid continuing on to the next step once we are shutting down. The flag is
+// read while holding the lock.
+func (obj *SSH) exitCheck() bool {
+	obj.lock.Lock()
+	stopping := obj.exiting
+	obj.lock.Unlock()
+	return stopping // true prevents continuing to the next stage
+}
+
+// Stop shuts down any SSH in progress as safely and quickly as possible. It
+// first sets the exiting flag so that no new stages get started, and then asks
+// each stage to exit in reverse order: exec, tunnel, sftp and finally the
+// connection itself. Every stage is attempted even if an earlier one failed,
+// and the first error that was seen is the one which is returned.
+func (obj *SSH) Stop() error {
+	obj.lock.Lock()
+	obj.exiting = true // don't spawn new steps once this flag is set!
+	obj.lock.Unlock()
+
+	// TODO: return all errors when we have a better error struct
+	var e error
+	// go through each stage in reverse order and request an exit
+	if err := obj.ExecExit(); e == nil && err != nil { // waits for program to exit
+		e = err
+	}
+	if err := obj.TunnelClose(); e == nil && err != nil {
+		e = err
+	}
+
+	// TODO: match errors due to stop signal and ignore them!
+	if err := obj.SftpClean(); e == nil && err != nil {
+		e = err
+	}
+	if err := obj.Close(); e == nil && err != nil {
+		e = err
+	}
+	return e
+}
+
+// The Remotes struct manages a set of SSH connections. It is built with
+// NewRemotes, started with Run, and shut down with Exit.
+// TODO: rename this to something more logical
+type Remotes struct {
+	clientURLs   []string // list of urls where the local server is listening
+	remoteURLs   []string // list of urls where the remote server connects to
+	noop         bool     // whether to run in noop mode
+	remotes      []string // list of remote graph definition files to run
+	cConns       uint16   // number of concurrent ssh connections, zero means unlimited
+	interactive  bool     // allow interactive prompting
+	sshPrivIdRsa string   // path to ~/.ssh/id_rsa
+
+	wg        sync.WaitGroup  // keep track of each running SSH connection
+	lock      sync.Mutex      // mutex for access to sshmap (also held when exiting is read or written)
+	sshmap    map[string]*SSH // map to each SSH struct with the remote as the key
+	exiting   bool            // flag to let us know if we're exiting
+	semaphore Semaphore       // counting semaphore to limit concurrent connections (only used when cConns is non-zero)
+}
+
+// The NewRemotes function builds a Remotes struct from the given settings. The
+// map of SSH connections starts out empty, and the semaphore is sized to the
+// cConns value.
+func NewRemotes(clientURLs, remoteURLs []string, noop bool, remotes []string, cConns uint16, interactive bool, sshPrivIdRsa string) *Remotes {
+	obj := new(Remotes)
+
+	// settings which are passed straight through
+	obj.clientURLs = clientURLs
+	obj.remoteURLs = remoteURLs
+	obj.noop = noop
+	obj.remotes = remotes
+	obj.cConns = cConns
+	obj.interactive = interactive
+	obj.sshPrivIdRsa = sshPrivIdRsa
+
+	// internal state
+	obj.sshmap = make(map[string]*SSH)
+	obj.semaphore = NewSemaphore(int(cConns))
+
+	return obj
+}
+
+// NewSSH is a helper function that does the initial parsing into an SSH obj.
+// It takes as input the path to a graph definition file. The `remote` field in
+// that file is parsed as a URL of the form: ssh://user:password@hostname:port
+// where the scheme, user, password and port are optional. The user defaults to
+// defaultUser and the port to defaultPort. The list of auth methods is built
+// from the URL password (if present), the ssh key (if it can be loaded), and
+// an interactive password prompt (if asked for, or if nothing else is found).
+func (obj *Remotes) NewSSH(file string) (*SSH, error) {
+	// first do the parsing...
+	config := ParseConfigFromFile(file)
+	if config == nil {
+		return nil, fmt.Errorf("Remote: Error parsing remote graph: %s", file)
+	}
+	if config.Remote == "" {
+		return nil, fmt.Errorf("Remote: No remote endpoint in the graph: %s", file)
+	}
+
+	// do the url parsing...
+	u, err := url.Parse(config.Remote)
+	if err != nil {
+		return nil, err
+	}
+	if u.Scheme != "" && u.Scheme != "ssh" {
+		return nil, fmt.Errorf("Unknown remote scheme: %s", u.Scheme)
+	}
+
+	host := ""
+	port := defaultPort // default
+	// NOTE: splitting on every colon means that a host which contains more
+	// than one colon (eg: a literal IPv6 address) is rejected below.
+	x := strings.Split(u.Host, ":")
+	if c := len(x); c == 0 || c > 2 { // need one or two chunks
+		return nil, fmt.Errorf("Can't parse host pattern: %s", u.Host)
+	} else if c == 2 {
+		v, err := strconv.ParseUint(x[1], 10, 16) // bitSize of 16 keeps it in uint16 range
+		if err != nil {
+			return nil, fmt.Errorf("Can't parse port: %s", x[1])
+		}
+		port = uint16(v)
+	}
+	host = x[0]
+	if host == "" {
+		return nil, fmt.Errorf("Empty hostname!")
+	}
+
+	user := defaultUser // default
+	if x := u.User.Username(); x != "" {
+		user = x
+	}
+	auth := []ssh.AuthMethod{}
+	if secret, b := u.User.Password(); b {
+		auth = append(auth, ssh.Password(secret))
+	}
+
+	// get ssh key auth if available
+	// NOTE: an error here is ignored on purpose, key auth is just skipped
+	if a, err := obj.sshKeyAuth(); err == nil {
+		auth = append(auth, a)
+	}
+
+	// if there are no auth methods available, add interactive to be helpful
+	if len(auth) == 0 || obj.interactive {
+		auth = append(auth, ssh.RetryableAuthMethod(ssh.PasswordCallback(obj.passwordCallback(user, host)), maxPasswordTries))
+	}
+
+	// NOTE: this can't trigger at the moment, since the block above always
+	// adds a method when the list is empty. it is kept as a safety net.
+	if len(auth) == 0 {
+		return nil, fmt.Errorf("No authentication methods available!")
+	}
+
+	return &SSH{
+		host:       host,
+		port:       port,
+		user:       user,
+		auth:       auth,
+		file:       file,
+		clientURLs: obj.clientURLs,
+		remoteURLs: obj.remoteURLs,
+		noop:       obj.noop,
+	}, nil
+}
+
+// sshKeyAuth is a helper function to get the ssh key auth struct needed. It
+// reads the private key from the configured path, and returns a public key
+// auth method which was built from it. A leading `~/` in the path is expanded
+// to the home directory of the current user, and any other path is used as it
+// was given. It errors if the path is empty, if the home directory can't be
+// found, or if the key file can't be read or parsed.
+func (obj *Remotes) sshKeyAuth() (ssh.AuthMethod, error) {
+	if obj.sshPrivIdRsa == "" {
+		return nil, fmt.Errorf("Empty path specified!")
+	}
+	// start with the path as given, so that absolute and relative paths
+	// work too. previously only paths with a `~/` prefix were ever used.
+	p := obj.sshPrivIdRsa
+	// TODO: this doesn't match strings of the form: ~james/.ssh/id_rsa
+	if strings.HasPrefix(p, "~/") {
+		usr, err := user.Current()
+		if err != nil {
+			log.Printf("Remote: Can't find home directory automatically.")
+			return nil, err
+		}
+		p = path.Join(usr.HomeDir, p[len("~/"):])
+	}
+	if p == "" {
+		return nil, fmt.Errorf("Empty path specified!")
+	}
+	// A public key may be used to authenticate against the server by using
+	// an unencrypted PEM-encoded private key file. If you have an encrypted
+	// private key, the crypto/x509 package can be used to decrypt it.
+	key, err := ioutil.ReadFile(p)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create the Signer for this private key.
+	signer, err := ssh.ParsePrivateKey(key)
+	if err != nil {
+		return nil, err
+	}
+
+	return ssh.PublicKeys(signer), nil
+}
+
+// passwordCallback is a function which returns the appropriate type of callback.
+// The callback prompts for the password of user@host on the terminal, and
+// returns what was typed. When interactive mode was explicitly asked for, the
+// prompt waits without a time limit, otherwise it gives up with an error once
+// nonInteractivePasswordTimeout is reached.
+func (obj *Remotes) passwordCallback(user, host string) func() (string, error) {
+	timeout := nonInteractivePasswordTimeout // default
+	if obj.interactive {                     // the user asked for interactive mode
+		timeout = -1 // unlimited when we asked for interactive mode!
+	}
+	cb := func() (string, error) {
+		// these have a buffer of one, so that the prompt go routine can
+		// always deliver its result and exit, even when nobody is left
+		// to receive it because the timeout was hit first
+		passchan := make(chan string, 1)
+		failchan := make(chan error, 1)
+
+		go func() {
+			log.Printf("Remote: Prompting for %s@%s password...", user, host)
+			fmt.Printf("Password: ")
+			password, err := gopass.GetPasswd()
+			if err != nil { // on ^C or getch() error
+				// returning an error will cancel the N retries on this
+				failchan <- err
+				return
+			}
+			passchan <- string(password)
+		}()
+
+		// wait for password, but include a timeout if we promiscuously
+		// added the interactive mode
+		select {
+		case p := <-passchan:
+			return p, nil
+		case e := <-failchan:
+			return "", e
+		case <-TimeAfterOrBlock(timeout):
+			return "", fmt.Errorf("Interactive timeout reached!")
+		}
+	}
+	return cb
+}
+
+// The Run method of the Remotes struct kicks it all off. It is usually run from
+// a go routine. For each remote graph file it builds an SSH struct, stores it
+// in the sshmap, and runs it in a go routine of its own which is tracked in the
+// wait group. When a connection limit is set, this blocks until a slot is free
+// before it starts the next one. A file which fails to parse is logged and
+// skipped. Once an exit has been requested, no new connections are started.
+func (obj *Remotes) Run() {
+	// give back the slot we took, only needed when a limit is in use
+	release := func() {
+		if obj.cConns != 0 {
+			obj.semaphore.V(1)
+		}
+	}
+
+	// the semaphore provides the max simultaneous connection limit
+	for _, f := range obj.remotes {
+		if obj.cConns != 0 {
+			obj.semaphore.P(1) // take one
+		}
+		obj.lock.Lock()
+		if obj.exiting {
+			// unlock and give back the slot before leaving, or
+			// else anything that needs the lock later would hang
+			obj.lock.Unlock()
+			release()
+			return
+		}
+		sshobj, err := obj.NewSSH(f)
+		if err != nil {
+			// same here: the next iteration takes the lock again,
+			// and nothing was started that would free the slot
+			obj.lock.Unlock()
+			release()
+			log.Printf("Remote: Error: %s", err)
+			continue
+		}
+		obj.sshmap[f] = sshobj // save a reference
+
+		obj.wg.Add(1)
+		go func() {
+			defer release()
+			defer obj.wg.Done()
+			if err := sshobj.Go(); err != nil {
+				log.Printf("Remote: Error: %s", err)
+			}
+		}()
+		obj.lock.Unlock()
+	}
+}
+
+// The Exit method causes as much of the Remotes struct to shutdown as quickly
+// and as cleanly as possible. It flags that we're exiting, stops each SSH
+// connection that was stored, one after the other, and only returns once all of
+// the running connections have finished.
+func (obj *Remotes) Exit() {
+	obj.lock.Lock()
+	obj.exiting = true // don't spawn new ones once this flag is set!
+	obj.lock.Unlock()
+
+	// Run only adds to the map while holding the lock, and only after it
+	// has checked the flag above, so nothing new can show up from here on
+	for _, name := range obj.remotes {
+		conn, found := obj.sshmap[name]
+		if found && conn != nil {
+			// TODO: should we run these as go routines?
+			if err := conn.Stop(); err != nil {
+				log.Printf("Remote: Error stopping: %s", err)
+			}
+		}
+	}
+
+	obj.wg.Wait() // wait for everyone to exit
+}
+
+// fmtUUID returns a string of n characters, each of which is picked from
+// formatChars with math/rand. It is not cryptographically safe. Unless the
+// math/rand source is seeded somewhere, the same sequence of strings usually
+// comes out each time the program is run, which makes this code repeatable.
+func fmtUUID(n int) string {
+	out := make([]byte, 0, n)
+	for len(out) < n {
+		pick := rand.Intn(len(formatChars))
+		out = append(out, formatChars[pick])
+	}
+	return string(out)
+}
+
+// cleanURL strips the scheme from a URL and returns only the host:port part.
+// The input may be given without a scheme. An empty string is returned if the
+// input can't be parsed.
+func cleanURL(s string) string {
+	raw := s
+	if strings.Index(s, "://") == -1 {
+		// url.Parse returns "" for the Host when it is given a bare
+		// "hostname:22", so add a scheme for it to chew on
+		raw = "ssh://" + raw
+	}
+	if parsed, err := url.Parse(raw); err == nil {
+		return parsed.Host
+	}
+	return ""
+}
+
+// Semaphore is a counting semaphore. It is a channel, where each element that
+// sits in the channel represents one resource which is currently held.
+type Semaphore chan struct{}
+
+// NewSemaphore builds a Semaphore which can hold up to size resources at once.
+func NewSemaphore(size int) Semaphore {
+	sem := make(chan struct{}, size)
+	return Semaphore(sem)
+}
+
+// P acquires n resources, one at a time. It blocks while the semaphore is full.
+func (s Semaphore) P(n int) {
+	for ; n > 0; n-- {
+		s <- struct{}{} // acquire one
+	}
+}
+
+// V releases n resources, one at a time. It blocks if there is nothing held.
+func (s Semaphore) V(n int) {
+	for ; n > 0; n-- {
+		<-s // release one
+	}
+}
+
+// combinedWriter mimics what the ssh.CombinedOutput command does. It collects
+// everything that is written to it into a single buffer, which lets one writer
+// be shared between stdout and stderr. Access to the buffer is serialized with
+// a mutex.
+type combinedWriter struct {
+	b  bytes.Buffer
+	mu sync.Mutex
+}
+
+// The Write method writes to the bytes buffer with a lock to mix output safely.
+func (w *combinedWriter) Write(p []byte) (int, error) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.b.Write(p)
+}
+
+// The String function returns the contents of the buffer. It takes the same
+// lock as Write does, so that it is safe to call while writes are happening.
+func (w *combinedWriter) String() string {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	return w.b.String()
+}
+
+// The PrefixedString returns the contents of the buffer with the prefix
+// prepended to every line. An empty buffer returns just the prefix.
+func (w *combinedWriter) PrefixedString(prefix string) string {
+	s := w.String()
+	out := prefix + strings.Replace(s, "\n", "\n"+prefix, -1)
+	// a trailing newline leaves a dangling prefix at the very end, so drop
+	// that one. only do this when we know that we added it ourselves, or
+	// else output which happens to end with the prefix text gets cut off.
+	if strings.HasSuffix(s, "\n") {
+		out = strings.TrimSuffix(out, prefix)
+	}
+	return out
+}
--- a/test/test-govet.sh
+++ b/test/test-govet.sh
@@ -5,4 +5,4 @@ ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && cd .. && pwd )"	# dir!
 cd "${ROOT}"

 go vet && echo PASS || exit 1	# since it doesn't output an ok message on pass
-grep 'log.' *.go | grep '\\n' && exit 1 || echo PASS	# no \n needed in log.Printf()
+grep 'log.' *.go | grep '\\n"' && exit 1 || echo PASS	# no \n needed in log.Printf()