Add configurable file-existence and HTTP health checks
Add a section to the config file called "health". Within this section, "file" and "http" list checks to run. Each check specifies a file or URI, a time interval for the check, and a threshold specifying how many times the check must fail to reach an unhealthy state. Document the new options in docs/configuration.md. Add unit testing for both types of checkers. Add an UnregisterAll function in the health package to support the unit tests, and an Unregister function for consistency with Register. Fix a string conversion problem in the health package's HTTP checker. Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
parent
e4b93d1e6d
commit
b09b0ffcf9
@ -135,6 +135,8 @@ type Configuration struct {
|
||||
} `yaml:"pool,omitempty"`
|
||||
} `yaml:"redis,omitempty"`
|
||||
|
||||
Health Health `yaml:"health,omitempty"`
|
||||
|
||||
Proxy Proxy `yaml:"proxy,omitempty"`
|
||||
}
|
||||
|
||||
@ -179,6 +181,37 @@ type MailOptions struct {
|
||||
To []string `yaml:"to,omitempty"`
|
||||
}
|
||||
|
||||
// FileChecker is a type of entry in the health section for checking files.
type FileChecker struct {
	// Interval is the length of time to wait between checks. It is a
	// time.Duration, not a bare number of seconds; a zero value lets the
	// registry fall back to its default check interval.
	Interval time.Duration `yaml:"interval,omitempty"`
	// File is the path to check
	File string `yaml:"file,omitempty"`
	// Threshold is the number of times a check must fail to trigger an
	// unhealthy state. A zero value registers the check without a
	// threshold.
	Threshold int `yaml:"threshold,omitempty"`
}
|
||||
|
||||
// HTTPChecker is a type of entry in the health section for checking HTTP
// URIs.
type HTTPChecker struct {
	// Interval is the length of time to wait between checks. It is a
	// time.Duration, not a bare number of seconds; a zero value lets the
	// registry fall back to its default check interval.
	Interval time.Duration `yaml:"interval,omitempty"`
	// URI is the HTTP URI to check
	URI string `yaml:"uri,omitempty"`
	// Threshold is the number of times a check must fail to trigger an
	// unhealthy state. A zero value registers the check without a
	// threshold.
	Threshold int `yaml:"threshold,omitempty"`
}
|
||||
|
||||
// Health provides the configuration section for health checks.
|
||||
type Health struct {
|
||||
// FileChecker is a list of paths to check
|
||||
FileCheckers []FileChecker `yaml:"file,omitempty"`
|
||||
// HTTPChecker is a list of URIs to check
|
||||
HTTPCheckers []HTTPChecker `yaml:"http,omitempty"`
|
||||
}
|
||||
|
||||
// v0_1Configuration is a Version 0.1 Configuration struct
|
||||
// This is currently aliased to Configuration, as it is the current version
|
||||
type v0_1Configuration Configuration
|
||||
|
@ -195,6 +195,15 @@ information about each option that appears later in this page.
|
||||
maxidle: 16
|
||||
maxactive: 64
|
||||
idletimeout: 300s
|
||||
health:
|
||||
file:
|
||||
- file: /path/to/checked/file
|
||||
interval: 10s
|
||||
threshold: 3
|
||||
http:
|
||||
- uri: http://server.to.check/must/return/200
|
||||
interval: 10s
|
||||
threshold: 3
|
||||
|
||||
In some instances a configuration option is **optional** but it contains child
|
||||
options marked as **required**. This indicates that you can omit the parent with
|
||||
@ -1588,6 +1597,141 @@ Configure the behavior of the Redis connection pool.
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## health
|
||||
|
||||
health:
|
||||
file:
|
||||
- file: /path/to/checked/file
|
||||
interval: 10s
|
||||
threshold: 3
|
||||
http:
|
||||
- uri: http://server.to.check/must/return/200
|
||||
interval: 10s
|
||||
threshold: 3
|
||||
|
||||
The health option is **optional**. It may contain lists of file checkers
|
||||
and/or HTTP checkers.
|
||||
|
||||
### file
|
||||
|
||||
file is a list of paths to be periodically checked for the existence of a file.
|
||||
If a file exists at the given path, the health check will fail. This can be
|
||||
used as a way of bringing a registry out of rotation by creating a file.
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Parameter</th>
|
||||
<th>Required</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>file</code>
|
||||
</td>
|
||||
<td>
|
||||
yes
|
||||
</td>
|
||||
<td>
|
||||
The path to check for the existence of a file.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>interval</code>
|
||||
</td>
|
||||
<td>
|
||||
no
|
||||
</td>
|
||||
<td>
|
||||
The length of time to wait between repetitions of the check. This field
|
||||
takes a positive integer and an optional suffix indicating the unit of
|
||||
time. Possible units are:
|
||||
<ul>
|
||||
<li><code>ns</code> (nanoseconds)</li>
|
||||
<li><code>us</code> (microseconds)</li>
|
||||
<li><code>ms</code> (milliseconds)</li>
|
||||
<li><code>s</code> (seconds)</li>
|
||||
<li><code>m</code> (minutes)</li>
|
||||
<li><code>h</code> (hours)</li>
|
||||
</ul>
|
||||
If you omit the suffix, the system interprets the value as nanoseconds.
|
||||
The default value is 10 seconds if this field is omitted.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>threshold</code>
|
||||
</td>
|
||||
<td>
|
||||
no
|
||||
</td>
|
||||
<td>
|
||||
An integer specifying the number of times the check must fail before the
|
||||
check triggers an unhealthy state. If this field is not specified, a
|
||||
single failure will trigger an unhealthy state.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### http
|
||||
|
||||
http is a list of HTTP URIs to be periodically checked with HEAD requests. If
|
||||
a HEAD request returns a status code other than 200, the health check will fail.
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th>Parameter</th>
|
||||
<th>Required</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>uri</code>
|
||||
</td>
|
||||
<td>
|
||||
yes
|
||||
</td>
|
||||
<td>
|
||||
The URI to check.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>interval</code>
|
||||
</td>
|
||||
<td>
|
||||
no
|
||||
</td>
|
||||
<td>
|
||||
The length of time to wait between repetitions of the check. This field
|
||||
takes a positive integer and an optional suffix indicating the unit of
|
||||
time. Possible units are:
|
||||
<ul>
|
||||
<li><code>ns</code> (nanoseconds)</li>
|
||||
<li><code>us</code> (microseconds)</li>
|
||||
<li><code>ms</code> (milliseconds)</li>
|
||||
<li><code>s</code> (seconds)</li>
|
||||
<li><code>m</code> (minutes)</li>
|
||||
<li><code>h</code> (hours)</li>
|
||||
</ul>
|
||||
If you omit the suffix, the system interprets the value as nanoseconds.
|
||||
The default value is 10 seconds if this field is omitted.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<code>threshold</code>
|
||||
</td>
|
||||
<td>
|
||||
no
|
||||
</td>
|
||||
<td>
|
||||
An integer specifying the number of times the check must fail before the
|
||||
check triggers an unhealthy state. If this field is not specified, a
|
||||
single failure will trigger an unhealthy state.
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
## Example: Development configuration
|
||||
|
||||
|
@ -2,9 +2,11 @@ package checks
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"github.com/docker/distribution/health"
|
||||
"net/http"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/docker/distribution/health"
|
||||
)
|
||||
|
||||
// FileChecker checks the existence of a file and returns an error
|
||||
@ -28,7 +30,7 @@ func HTTPChecker(r string) health.Checker {
|
||||
return errors.New("error while checking: " + r)
|
||||
}
|
||||
if response.StatusCode != http.StatusOK {
|
||||
return errors.New("downstream service returned unexpected status: " + string(response.StatusCode))
|
||||
return errors.New("downstream service returned unexpected status: " + strconv.Itoa(response.StatusCode))
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
@ -170,6 +170,20 @@ func Register(name string, check Checker) {
|
||||
registeredChecks[name] = check
|
||||
}
|
||||
|
||||
// Unregister removes the named checker.
|
||||
func Unregister(name string) {
|
||||
mutex.Lock()
|
||||
defer mutex.Unlock()
|
||||
delete(registeredChecks, name)
|
||||
}
|
||||
|
||||
// UnregisterAll removes all registered checkers.
|
||||
func UnregisterAll() {
|
||||
mutex.Lock()
|
||||
defer mutex.Unlock()
|
||||
registeredChecks = make(map[string]Checker)
|
||||
}
|
||||
|
||||
// RegisterFunc allows the convenience of registering a checker directly
|
||||
// from an arbitrary func() error
|
||||
func RegisterFunc(name string, check func() error) {
|
||||
|
@ -15,6 +15,7 @@ import (
|
||||
"github.com/docker/distribution/configuration"
|
||||
ctxu "github.com/docker/distribution/context"
|
||||
"github.com/docker/distribution/health"
|
||||
"github.com/docker/distribution/health/checks"
|
||||
"github.com/docker/distribution/notifications"
|
||||
"github.com/docker/distribution/registry/api/errcode"
|
||||
"github.com/docker/distribution/registry/api/v2"
|
||||
@ -37,6 +38,9 @@ import (
|
||||
// was specified.
|
||||
const randomSecretSize = 32
|
||||
|
||||
// defaultCheckInterval is the default time in between health checks
|
||||
const defaultCheckInterval = 10 * time.Second
|
||||
|
||||
// App is a global registry application object. Shared resources can be placed
|
||||
// on this object that will be accessible from all requests. Any writable
|
||||
// fields should be protected.
|
||||
@ -231,10 +235,38 @@ func NewApp(ctx context.Context, configuration configuration.Configuration) *App
|
||||
// implementing this properly will require a refactor. This method may panic
|
||||
// if called twice in the same process.
|
||||
func (app *App) RegisterHealthChecks() {
|
||||
health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), 10*time.Second, 3, func() error {
|
||||
health.RegisterPeriodicThresholdFunc("storagedriver_"+app.Config.Storage.Type(), defaultCheckInterval, 3, func() error {
|
||||
_, err := app.driver.List(app, "/") // "/" should always exist
|
||||
return err // any error will be treated as failure
|
||||
})
|
||||
|
||||
for _, fileChecker := range app.Config.Health.FileCheckers {
|
||||
interval := fileChecker.Interval
|
||||
if interval == 0 {
|
||||
interval = defaultCheckInterval
|
||||
}
|
||||
if fileChecker.Threshold != 0 {
|
||||
ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d, threshold=%d", fileChecker.File, interval/time.Second, fileChecker.Threshold)
|
||||
health.Register(fileChecker.File, health.PeriodicThresholdChecker(checks.FileChecker(fileChecker.File), interval, fileChecker.Threshold))
|
||||
} else {
|
||||
ctxu.GetLogger(app).Infof("configuring file health check path=%s, interval=%d", fileChecker.File, interval/time.Second)
|
||||
health.Register(fileChecker.File, health.PeriodicChecker(checks.FileChecker(fileChecker.File), interval))
|
||||
}
|
||||
}
|
||||
|
||||
for _, httpChecker := range app.Config.Health.HTTPCheckers {
|
||||
interval := httpChecker.Interval
|
||||
if interval == 0 {
|
||||
interval = defaultCheckInterval
|
||||
}
|
||||
if httpChecker.Threshold != 0 {
|
||||
ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d, threshold=%d", httpChecker.URI, interval/time.Second, httpChecker.Threshold)
|
||||
health.Register(httpChecker.URI, health.PeriodicThresholdChecker(checks.HTTPChecker(httpChecker.URI), interval, httpChecker.Threshold))
|
||||
} else {
|
||||
ctxu.GetLogger(app).Infof("configuring HTTP health check uri=%s, interval=%d", httpChecker.URI, interval/time.Second)
|
||||
health.Register(httpChecker.URI, health.PeriodicChecker(checks.HTTPChecker(httpChecker.URI), interval))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// register a handler with the application, by route name. The handler will be
|
||||
|
200
registry/handlers/health_test.go
Normal file
200
registry/handlers/health_test.go
Normal file
@ -0,0 +1,200 @@
|
||||
package handlers
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/docker/distribution/configuration"
|
||||
"github.com/docker/distribution/health"
|
||||
"golang.org/x/net/context"
|
||||
)
|
||||
|
||||
func TestFileHealthCheck(t *testing.T) {
|
||||
// In case other tests registered checks before this one
|
||||
health.UnregisterAll()
|
||||
|
||||
interval := time.Second
|
||||
|
||||
tmpfile, err := ioutil.TempFile(os.TempDir(), "healthcheck")
|
||||
if err != nil {
|
||||
t.Fatalf("could not create temporary file: %v", err)
|
||||
}
|
||||
defer tmpfile.Close()
|
||||
|
||||
config := configuration.Configuration{
|
||||
Storage: configuration.Storage{
|
||||
"inmemory": configuration.Parameters{},
|
||||
},
|
||||
Health: configuration.Health{
|
||||
FileCheckers: []configuration.FileChecker{
|
||||
{
|
||||
Interval: interval,
|
||||
File: tmpfile.Name(),
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
app := NewApp(ctx, config)
|
||||
app.RegisterHealthChecks()
|
||||
|
||||
debugServer := httptest.NewServer(nil)
|
||||
|
||||
// Wait for health check to happen
|
||||
<-time.After(2 * interval)
|
||||
|
||||
resp, err := http.Get(debugServer.URL + "/debug/health")
|
||||
if err != nil {
|
||||
t.Fatalf("error performing HTTP GET: %v", err)
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
t.Fatalf("error reading HTTP body: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
var decoded map[string]string
|
||||
err = json.Unmarshal(body, &decoded)
|
||||
if err != nil {
|
||||
t.Fatalf("error unmarshaling json: %v", err)
|
||||
}
|
||||
if len(decoded) != 1 {
|
||||
t.Fatal("expected 1 item in returned json")
|
||||
}
|
||||
if decoded[tmpfile.Name()] != "file exists" {
|
||||
t.Fatal(`did not get "file exists" result for health check`)
|
||||
}
|
||||
|
||||
os.Remove(tmpfile.Name())
|
||||
|
||||
<-time.After(2 * interval)
|
||||
resp, err = http.Get(debugServer.URL + "/debug/health")
|
||||
if err != nil {
|
||||
t.Fatalf("error performing HTTP GET: %v", err)
|
||||
}
|
||||
body, err = ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
t.Fatalf("error reading HTTP body: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
var decoded2 map[string]string
|
||||
err = json.Unmarshal(body, &decoded2)
|
||||
if err != nil {
|
||||
t.Fatalf("error unmarshaling json: %v", err)
|
||||
}
|
||||
if len(decoded2) != 0 {
|
||||
t.Fatal("expected 0 items in returned json")
|
||||
}
|
||||
}
|
||||
|
||||
func TestHTTPHealthCheck(t *testing.T) {
|
||||
// In case other tests registered checks before this one
|
||||
health.UnregisterAll()
|
||||
|
||||
interval := time.Second
|
||||
threshold := 3
|
||||
|
||||
stopFailing := make(chan struct{})
|
||||
|
||||
checkedServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != "HEAD" {
|
||||
t.Fatalf("expected HEAD request, got %s", r.Method)
|
||||
}
|
||||
select {
|
||||
case <-stopFailing:
|
||||
w.WriteHeader(http.StatusOK)
|
||||
default:
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
}
|
||||
}))
|
||||
|
||||
config := configuration.Configuration{
|
||||
Storage: configuration.Storage{
|
||||
"inmemory": configuration.Parameters{},
|
||||
},
|
||||
Health: configuration.Health{
|
||||
HTTPCheckers: []configuration.HTTPChecker{
|
||||
{
|
||||
Interval: interval,
|
||||
URI: checkedServer.URL,
|
||||
Threshold: threshold,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
app := NewApp(ctx, config)
|
||||
app.RegisterHealthChecks()
|
||||
|
||||
debugServer := httptest.NewServer(nil)
|
||||
|
||||
for i := 0; ; i++ {
|
||||
<-time.After(interval)
|
||||
|
||||
resp, err := http.Get(debugServer.URL + "/debug/health")
|
||||
if err != nil {
|
||||
t.Fatalf("error performing HTTP GET: %v", err)
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
t.Fatalf("error reading HTTP body: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
var decoded map[string]string
|
||||
err = json.Unmarshal(body, &decoded)
|
||||
if err != nil {
|
||||
t.Fatalf("error unmarshaling json: %v", err)
|
||||
}
|
||||
|
||||
if i < threshold-1 {
|
||||
// definitely shouldn't have hit the threshold yet
|
||||
if len(decoded) != 0 {
|
||||
t.Fatal("expected 1 items in returned json")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i < threshold+1 {
|
||||
// right on the threshold - don't expect a failure yet
|
||||
continue
|
||||
}
|
||||
|
||||
if len(decoded) != 1 {
|
||||
t.Fatal("expected 1 item in returned json")
|
||||
}
|
||||
if decoded[checkedServer.URL] != "downstream service returned unexpected status: 500" {
|
||||
t.Fatal("did not get expected result for health check")
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
// Signal HTTP handler to start returning 200
|
||||
close(stopFailing)
|
||||
|
||||
<-time.After(2 * interval)
|
||||
resp, err := http.Get(debugServer.URL + "/debug/health")
|
||||
if err != nil {
|
||||
t.Fatalf("error performing HTTP GET: %v", err)
|
||||
}
|
||||
body, err := ioutil.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
t.Fatalf("error reading HTTP body: %v", err)
|
||||
}
|
||||
resp.Body.Close()
|
||||
var decoded map[string]string
|
||||
err = json.Unmarshal(body, &decoded)
|
||||
if err != nil {
|
||||
t.Fatalf("error unmarshaling json: %v", err)
|
||||
}
|
||||
if len(decoded) != 0 {
|
||||
t.Fatal("expected 0 items in returned json")
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user