Files
mgmt/engine/resources/http_server_proxy.go
James Shubin 654e958d3f engine: resources: Add the proper prefix to grouped http resources
Resources that can be grouped into the http:server resource must have
that prefix. Grouping is basically hierarchical, and without that common
prefix, it means we'd have to special-case our grouping algorithm.
2025-05-25 01:40:25 -04:00

628 lines
21 KiB
Go

// Mgmt
// Copyright (C) James Shubin and the project contributors
// Written by James Shubin <james@shubin.ca> and the project contributors
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
//
// Additional permission under GNU GPL version 3 section 7
//
// If you modify this program, or any covered work, by linking or combining it
// with embedded mcl code and modules (and that the embedded mcl code and
// modules which link with this program, contain a copy of their source code in
// the authoritative form) containing parts covered by the terms of any other
// license, the licensors of this program grant you additional permission to
// convey the resulting work. Furthermore, the licensors of this program grant
// the original author, James Shubin, additional permission to update this
// additional permission if he deems it necessary to achieve the goals of this
// additional permission.
package resources
import (
"bytes"
"context"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/purpleidea/mgmt/engine"
"github.com/purpleidea/mgmt/engine/traits"
)
const (
httpServerProxyKind = httpServerKind + ":proxy"
)
var (
// httpServerProxyRWMutex synchronizes against reads and writes to the cache.
// TODO: we could instead have a per-cache path individual mutex, but to
// keep things simple for now, we just lumped them all together.
httpServerProxyRWMutex *sync.RWMutex
)
func init() {
httpServerProxyRWMutex = &sync.RWMutex{}
engine.RegisterResource(httpServerProxyKind, func() engine.Res { return &HTTPServerProxyRes{} })
}
// HTTPServerProxyRes is a resource representing a special path that exists
// within an http server. The name is used as the public path of the endpoint,
// unless the path field is specified, and in that case it is used instead. The
// way this works is that it autogroups at runtime with an existing http server
// resource, and in doing so makes the path associated with this resource
// available when serving files. When something under the path is accessed, this
// is pulled from the backing http server, which makes an http client connection
// if needed to pull the authoritative file down, saves it locally for future
// use, and then returns it to the original http client caller. On a subsequent
// call, if the cache was not invalidated, the file doesn't need to be fetched
// from the network. In effect, this works as a caching http proxy. If you
// create this as a resource which responds to the same type of request as an
// http:server:file resource or any other kind of resource, it is undefined
// behaviour which will answer the request. The most common clash will happen if
// both are present at the same path. This particular implementation stores some
// file data in memory as a convenience instead of streaming directly to
// clients. This makes locking much easier, but is wasteful. If you plan on
// using this for huge files and on systems with low amounts of memory, you
// might want to optimize this. The resultant proxy path is determined by
// subtracting the `Sub` field from the `Path` (and request path) and then
// appending the result to the `Head` field.
type HTTPServerProxyRes struct {
traits.Base // add the base methods without re-implementation
traits.Edgeable // XXX: add autoedge support
traits.Groupable // can be grouped into HTTPServerRes
init *engine.Init
// Server is the name of the http server resource to group this into. If
// it is omitted, and there is only a single http resource, then it will
// be grouped into it automatically. If there is more than one main http
// resource being used, then the grouping behaviour is *undefined* when
// this is not specified, and it is not recommended to leave this blank!
Server string `lang:"server" yaml:"server"`
// Path is the path that this presents as on the grouped http server. It
// overrides the Name var if specified.
Path string `lang:"path" yaml:"path"`
// Sub is the string to remove from the start of the request, the path
// of which is looking at the Name/Path field to see if it matches. If
// it matches, it then translates to the destination server by removing
// this `Sub` string from the start of the path request, and then
// prepending on the `Head` field to get to the resultant proxy URL. If
// this is empty, then nothing is subtracted.
Sub string `lang:"sub" yaml:"sub"`
// Head is the string to add on as a prefix to the new URL we are
// building for the proxy. If this is empty, the proxy can't work, and
// we can only rely on what is available in our local cache. This is
// typically the protocol and hostname for the backing server.
Head string `lang:"head" yaml:"head"`
// Cache is an absolute path to a location on disk where cached files
// can be stored. If this is empty then we will not cache any files.
// TODO: We could add future in-memory stores, a checksum feature, etc
Cache string `lang:"cache" yaml:"cache"`
// TODO: Add tests for the Path+Sub+Head+Cache path math stuff
// TODO: Allow this to support single file proxying and caching
// TODO: Add a TTL param to expire cached downloads
// TODO: Add a max depth param to prevent people creating 1000 deep dirs
// TODO: Allow single-file caching that can then also send/recv it out
// TODO: Add a Force param to let a dir in cache folder get converted to a file or vice-versa
// TODO: Add an alternate API that consumes a "mapping" function instead
// of Sub/Head. Eg: mapping => func($s) { {"/fedora/" => "https://example.com/foo/" }[$s] }
// pathParser is the local instantiation of our parser that gets reused.
pathParser *pathParser
}
// Default returns some sensible defaults for this resource.
func (obj *HTTPServerProxyRes) Default() engine.Res {
return &HTTPServerProxyRes{}
}
// getPath returns the actual path we respond to. When Path is not specified, we
// use the Name.
func (obj *HTTPServerProxyRes) getPath() string {
if obj.Path != "" {
return obj.Path
}
return obj.Name()
}
// serveHTTP is the real implementation of ServeHTTP, but with a more ergonomic
// signature.
func (obj *HTTPServerProxyRes) serveHTTP(ctx context.Context, requestPath string) (handlerFuncError, error) {
// TODO: switch requestPath to use safepath.AbsPath instead of a string
result, err := obj.pathParser.parse(requestPath)
if err != nil {
return nil, err
}
cachePath := result.cachePath
if obj.Cache != "" { // check in the cache...
// TODO: do cache invalidation here
fn, err := obj.getCachedFile(ctx, cachePath)
if err != nil && !os.IsNotExist(err) {
// cache dir is broken, error?
return nil, err
}
if err == nil {
obj.init.Logf("cache: %s", cachePath)
return fn, nil
}
// otherwise, it must be a file not found in cache error...
}
if obj.Head == "" { // we can't download (we can only pull from cache)
return nil, fmt.Errorf("can't proxy") // NOT a 404 error!
}
proxyURL := result.proxyURL
// FIXME: should we be using a different client?
client := http.DefaultClient
request, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil) // (*Request, error)
if err != nil {
return nil, err
}
// TODO: add a progress logf...
obj.init.Logf("get: %s", proxyURL)
return func(w http.ResponseWriter, req *http.Request) error {
// Tell the client right away, that we're working on things...
// TODO: Is this valuable to give us more time to block?
// NOTE: Using this header breaks wget2!
//w.WriteHeader(http.StatusProcessing) // http 102, RFC 2518, 10.1
response, err := client.Do(request) // (*Response, error)
if err != nil {
return err
}
defer response.Body.Close() // free
if response.StatusCode != http.StatusOK { // 200
return &httpError{
//http.StatusText(http.StatusCode)
msg: fmt.Sprintf("bad status: %d", response.StatusCode),
code: http.StatusNotFound, // 404
}
}
// Determine the last-modified time if we can.
modtime := time.Now()
key := http.CanonicalHeaderKey("Last-Modified")
if headers, exists := response.Header[key]; exists && len(headers) > 0 { // []string
text := headers[len(headers)-1] // last element
lastModified, err := http.ParseTime(text) // (time.Time, error)
if err == nil {
modtime = lastModified
}
}
// We stream the download into both the client requesting it
// indirectly through this proxy, and also into the cache if we
// want to store the file. This lets us not OOM on large files.
// This is necessary because otherwise a client would get bored
// of waiting for us to download large files before sending them
// off. One downside is we haven't implemented range requests
// that http.ServeContent did for us magically, and we also
// currently might not do all of the http/header magic it did.
var copyError error
writers := []io.Writer{w} // out to the client
if obj.Cache != "" { // check in the cache...
httpServerProxyRWMutex.Lock()
defer httpServerProxyRWMutex.Unlock()
// store in cachePath
if err := os.MkdirAll(filepath.Dir(cachePath), 0700); err != nil {
return err
}
// store the last modified time if set
// TODO: is there a file last-modified-time precision
// issue if we use this value in a future HTTP
// If-Modified-Since header?
defer func() { // Do this after the file closes
if copyError != nil {
return // skip if we have a copy error
}
if err := os.Chtimes(cachePath, time.Time{}, modtime); err != nil {
// TODO: what do we do here?
obj.init.Logf("could not chtimes: %s", cachePath)
}
}()
// TODO: use ctx
out, err := os.Create(cachePath)
if err != nil {
return err
}
defer out.Close()
writers = append(writers, out)
// TODO: consider doing something like this to stream?
//reader = io.TeeReader(reader, writer)
}
requestPath := req.URL.Path // TODO: is this what we want here?
if !isZeroTime(modtime) { // from: src/net/http/fs.go:setLastModified
w.Header().Set("Last-Modified", modtime.UTC().Format(http.TimeFormat))
}
if obj.init.Debug || true {
obj.init.Logf("got content-length: %d bytes", response.ContentLength)
}
if size := response.ContentLength; size > 0 {
w.Header().Set("Content-Length", strconv.FormatInt(size, 10))
}
// Use the file's extension to find the ctype, skip if we would
// need to seek into the file to read some header content bytes.
// TODO: Is this sufficient for setting Content-Type ?
if ctype := mime.TypeByExtension(filepath.Ext(requestPath)); ctype != "" {
w.Header().Set("Content-Type", ctype)
}
w.WriteHeader(http.StatusOK) // Tell client everything is ok.
//if progressBar {
// writers = append(writers, progress)
//}
writer := io.MultiWriter(writers...)
if len(writers) == 1 {
writer = writers[0] // simplify
}
if _, err := io.Copy(writer, response.Body); err != nil {
if obj.Cache != "" { // check in the cache...
// We already took the mutex earlier!
obj.init.Logf("removing a partial file: %s", cachePath)
if err := os.Remove(cachePath); err != nil {
// TODO: what do we do here?
obj.init.Logf("could not remove: %s", cachePath)
}
}
// Even if we have an error, it's too late to error.
//return err // superfluous response.WriteHeader call
copyError = err // store for defer
return nil
}
//obj.init.Logf("%d bytes sent", n) // XXX: how do we know (on the server-side) if it worked?
return nil
}, nil
}
// getCachedFile pulls a file from our local cache if it exists. It returns the
// correct http handler on success, which we can then run.
func (obj *HTTPServerProxyRes) getCachedFile(ctx context.Context, absPath string) (handlerFuncError, error) {
// TODO: if infinite reads keep coming in, do we indefinitely-postpone
// the locking so that a new file can be saved in the cache?
httpServerProxyRWMutex.RLock()
defer httpServerProxyRWMutex.RUnlock()
f, err := os.Open(absPath)
if err != nil {
return nil, err
}
defer f.Close() // ignore error
// Determine the last-modified time if we can.
modtime := time.Now()
fi, err := f.Stat()
if err == nil {
modtime = fi.ModTime()
}
// TODO: if Stat errors, should we fail the whole thing?
data, err := io.ReadAll(f)
if err != nil {
return nil, err
}
handle := bytes.NewReader(data) // buffer for mutex
return func(w http.ResponseWriter, req *http.Request) error {
requestPath := req.URL.Path // TODO: is this what we want here?
http.ServeContent(w, req, requestPath, modtime, handle)
//obj.init.Logf("%d bytes sent", n) // XXX: how do we know (on the server-side) if it worked?
return nil
}, nil
}
// ParentName is used to limit which resources autogroup into this one. If it's
// empty then it's ignored, otherwise it must match the Name of the parent to
// get grouped.
func (obj *HTTPServerProxyRes) ParentName() string {
return obj.Server
}
// AcceptHTTP determines whether we will respond to this request. Return nil to
// accept, or any error to pass.
func (obj *HTTPServerProxyRes) AcceptHTTP(req *http.Request) error {
requestPath := req.URL.Path // TODO: is this what we want here?
if p := obj.getPath(); strings.HasSuffix(p, "/") { // a dir!
if strings.HasPrefix(requestPath, p) {
// relative dir root
return nil
}
}
if requestPath != obj.getPath() {
return fmt.Errorf("unhandled path")
}
return nil
}
// ServeHTTP is the standard HTTP handler that will be used here.
func (obj *HTTPServerProxyRes) ServeHTTP(w http.ResponseWriter, req *http.Request) {
// We only allow GET at the moment.
if req.Method != http.MethodGet {
w.WriteHeader(http.StatusMethodNotAllowed)
return
}
// XXX: does this cancel when our resource does?
ctx := req.Context() // context.Context
requestPath := req.URL.Path // TODO: is this what we want here?
// TODO: use safepath instead
//absPath, err := safepath.ParseIntoAbsPath(requestPath)
//if err != nil {
// obj.init.Logf("invalid input path: %s", requestPath)
// sendHTTPError(w, err)
// return
//}
//fn, err := obj.serveHTTP(ctx, absPath)
fn, err := obj.serveHTTP(ctx, requestPath)
if err != nil {
obj.init.Logf("error: %s", err)
sendHTTPError(w, err)
return
}
if err := fn(w, req); err != nil {
obj.init.Logf("error: %s", err)
sendHTTPError(w, err)
return
}
//obj.init.Logf("%d bytes sent", n) // XXX: how do we know (on the server-side) if it worked?
}
// Validate checks if the resource data structure was populated correctly.
func (obj *HTTPServerProxyRes) Validate() error {
if obj.getPath() == "" {
return fmt.Errorf("empty filename")
}
// FIXME: does getPath need to start with a slash?
if !strings.HasPrefix(obj.getPath(), "/") {
return fmt.Errorf("the path must be absolute")
}
if obj.Sub != "" && (!strings.HasPrefix(obj.Sub, "/") || !strings.HasSuffix(obj.Sub, "/")) {
return fmt.Errorf("the Sub field must be either empty or an absolute dir prefix")
}
if obj.Head != "" {
if !strings.HasSuffix(obj.Head, "/") {
return fmt.Errorf("the Head must end with a slash")
}
if _, err := url.Parse(obj.Head); err != nil {
return err
}
}
if obj.Cache != "" && (!strings.HasPrefix(obj.Cache, "/") || !strings.HasSuffix(obj.Cache, "/")) {
return fmt.Errorf("the Cache field must be either empty or an absolute dir")
}
return nil
}
// Init runs some startup code for this resource.
func (obj *HTTPServerProxyRes) Init(init *engine.Init) error {
obj.init = init // save for later
obj.pathParser = &pathParser{
path: obj.getPath(),
sub: obj.Sub,
head: obj.Head, // mirror
cache: obj.Cache,
}
return nil
}
// Cleanup is run by the engine to clean up after the resource is done.
func (obj *HTTPServerProxyRes) Cleanup() error {
return nil
}
// Watch is the primary listener for this resource and it outputs events. This
// particular one does absolutely nothing but block until we've received a done
// signal.
func (obj *HTTPServerProxyRes) Watch(ctx context.Context) error {
obj.init.Running() // when started, notify engine that we're running
select {
case <-ctx.Done(): // closed by the engine to signal shutdown
}
//obj.init.Event() // notify engine of an event (this can block)
return nil
}
// CheckApply never has anything to do for this resource, so it always succeeds.
func (obj *HTTPServerProxyRes) CheckApply(ctx context.Context, apply bool) (bool, error) {
if obj.init.Debug {
obj.init.Logf("CheckApply")
}
return true, nil // always succeeds, with nothing to do!
}
// Cmp compares two resources and returns an error if they are not equivalent.
func (obj *HTTPServerProxyRes) Cmp(r engine.Res) error {
// we can only compare HTTPServerProxyRes to others of the same resource kind
res, ok := r.(*HTTPServerProxyRes)
if !ok {
return fmt.Errorf("res is not the same kind")
}
if obj.Server != res.Server {
return fmt.Errorf("the Server field differs")
}
if obj.Path != res.Path {
return fmt.Errorf("the Path differs")
}
if obj.Sub != res.Sub {
return fmt.Errorf("the Sub differs")
}
if obj.Head != res.Head {
return fmt.Errorf("the Head differs")
}
if obj.Cache != res.Cache {
return fmt.Errorf("the Cache differs")
}
return nil
}
// UnmarshalYAML is the custom unmarshal handler for this struct. It is
// primarily useful for setting the defaults.
func (obj *HTTPServerProxyRes) UnmarshalYAML(unmarshal func(interface{}) error) error {
type rawRes HTTPServerProxyRes // indirection to avoid infinite recursion
def := obj.Default() // get the default
res, ok := def.(*HTTPServerProxyRes) // put in the right format
if !ok {
return fmt.Errorf("could not convert to HTTPServerProxyRes")
}
raw := rawRes(*res) // convert; the defaults go here
if err := unmarshal(&raw); err != nil {
return err
}
*obj = HTTPServerProxyRes(raw) // restore from indirection with type conversion!
return nil
}
// pathResult is the output resultant paths.
type pathResult struct {
proxyURL string
cachePath string
}
// pathParser is the input data that is used by parse.
type pathParser struct {
path string // obj.getPath() (name field)
sub string // obj.Sub (???)
head string // obj.Head (mirror)
cache string // obj.Cache (destination dir)
}
// parse handles the logic of transforming the input settings and data into the
// resultant paths. It is implemented as a standalone function to facilitate
// testing.
func (obj *pathParser) parse(requestPath string) (*pathResult, error) {
if !strings.HasPrefix(requestPath, "/") {
return nil, fmt.Errorf("request was not absolute") // unexpected!
}
tail := "" // !isDir
if strings.HasSuffix(requestPath, "/") {
tail = "/" // isDir
// TODO: can we handle paths that look like dirs?
}
// check for ../ type stuff?
if requestPath+tail != filepath.Clean(requestPath) {
return nil, fmt.Errorf("request was not clean")
}
// request path must have our configured path as a base
if !strings.HasPrefix(requestPath, obj.path) {
return nil, newHTTPError(http.StatusNotFound) // 404
}
// Does our request path (which we now know has the same prefix as
// obj.path) have sub as a prefix? If not, we can't sub it and we error.
if obj.sub != "" && !strings.HasPrefix(requestPath, obj.sub) {
return nil, newHTTPError(http.StatusNotFound) // 404
}
// start building new proxyURL and cachePath
tailPath := strings.TrimPrefix(requestPath, obj.sub) // relFile or relDir (if we get a dir-like requestPath)
proxyURL := obj.head + tailPath // good!
result := &pathResult{
proxyURL: proxyURL,
//cachePath: cachePath,
}
if obj.head == "" { // we might _only_ have a cache and no proxyURL
result = nil
}
if obj.cache != "" {
if !strings.HasPrefix(obj.cache, "/") {
return result, fmt.Errorf("cache was not absolute")
}
if !strings.HasSuffix(obj.cache, "/") {
return result, fmt.Errorf("cache was not a dir")
}
}
if result == nil { // we have a cachePath, but no proxyURL
result = &pathResult{}
}
rel := strings.TrimPrefix(requestPath, obj.path) // we already checked
result.cachePath = obj.cache + rel
return result, nil
}
// handlerFuncError is http.HandlerFunc, but with a more ergonomic signature.
type handlerFuncError func(http.ResponseWriter, *http.Request) error
// isZeroTime reports whether t is obviously unspecified (either zero or
// Unix()=0). Adapted from: src/net/http/fs.go
func isZeroTime(t time.Time) bool {
var unixEpochTime = time.Unix(0, 0)
return t.IsZero() || t.Equal(unixEpochTime)
}