From 5174ef33576f461d43f43b2019f5e10655b4c78f Mon Sep 17 00:00:00 2001 From: dormando Date: Wed, 25 Mar 2020 14:02:11 -0700 Subject: [PATCH] restart: fix rare segfault on shutdown Client connections were being closed and cleaned up after worker threads exit. In 2018 a patch went in to have the worker threads actually free their event base when stopped. If your system is strict enough (which is apparently none out of the dozen+ systems we've tested against!) it will segfault on invalid memory. This change leaves the workers hung while they wait for connections to be centrally closed. I would prefer to have each worker thread close its own connections for speed if nothing else, but we still need to close the listener connections and any connections currently open in side channels. Much apprecation to darix for helping narrow this down, as it presented as a wiped stack that only appeared in a specific build environment on a specific linux distribution. Hopefully with all of the valgrind noise fixes lately we can start running it more regularly and spot these early. --- memcached.c | 19 ++++++++++++------- memcached.h | 3 +-- thread.c | 14 ++++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/memcached.c b/memcached.c index 2592b3f94..dd52dd04c 100644 --- a/memcached.c +++ b/memcached.c @@ -939,6 +939,18 @@ static void conn_close(conn *c) { return; } +// Since some connections might be off on side threads and some are managed as +// listeners we need to walk through them all from a central point. +// Must be called with all worker threads hung or in the process of closing. +void conn_close_all(void) { + int i; + for (i = 0; i < max_fds; i++) { + if (conns[i] && conns[i]->state != conn_closed) { + conn_close(conns[i]); + } + } +} + /** * Convert a state name to a human readable form. */ @@ -10126,13 +10138,6 @@ int main (int argc, char **argv) { fprintf(stderr, "Gracefully stopping\n"); stop_threads(); - int i; - // FIXME: make a function callable from threads.c - for (i = 0; i < max_fds; i++) { - if (conns[i] && conns[i]->state != conn_closed) { - conn_close(conns[i]); - } - } if (memory_file != NULL) { restart_mmap_close(); } diff --git a/memcached.h b/memcached.h index a3ddd88c1..bdc38bd9c 100644 --- a/memcached.h +++ b/memcached.h @@ -820,9 +820,8 @@ enum delta_result_type add_delta(conn *c, const char *key, const int64_t delta, char *buf, uint64_t *cas); void accept_new_conns(const bool do_accept); -conn *conn_from_freelist(void); -bool conn_add_to_freelist(conn *c); void conn_close_idle(conn *c); +void conn_close_all(void); item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes); #define DO_UPDATE true #define DONT_UPDATE false diff --git a/thread.c b/thread.c index ed9765a48..e7f96dcba 100644 --- a/thread.c +++ b/thread.c @@ -204,6 +204,7 @@ void stop_threads(void) { if (settings.verbose > 0) fprintf(stderr, "asking workers to stop\n"); buf[0] = 's'; + pthread_mutex_lock(&worker_hang_lock); pthread_mutex_lock(&init_lock); init_count = 0; for (i = 0; i < settings.num_threads; i++) { @@ -215,6 +216,8 @@ void stop_threads(void) { wait_for_thread_registration(settings.num_threads); pthread_mutex_unlock(&init_lock); + // All of the workers are hung but haven't done cleanup yet. + if (settings.verbose > 0) fprintf(stderr, "asking background threads to stop\n"); @@ -236,6 +239,17 @@ void stop_threads(void) { if (settings.verbose > 0) fprintf(stderr, "stopped idle timeout thread\n"); + // Close all connections then let the workers finally exit. + if (settings.verbose > 0) + fprintf(stderr, "closing connections\n"); + conn_close_all(); + pthread_mutex_unlock(&worker_hang_lock); + if (settings.verbose > 0) + fprintf(stderr, "reaping worker threads\n"); + for (i = 0; i < settings.num_threads; i++) { + pthread_join(threads[i].thread_id, NULL); + } + if (settings.verbose > 0) fprintf(stderr, "all background threads stopped\n");