From f2cd05b8d60d27f43b07175b92ef4c2a69b8e3a2 Mon Sep 17 00:00:00 2001
From: Joel Colledge
Date: Wed, 6 Sep 2023 15:49:44 +0200
Subject: [PATCH 02/20] drbd: improve decision about marking a failed disk Outdated

Sometimes it is possible to update the metadata even after our disk has
failed. We were too eager to remove the MDF_WAS_UP_TO_DATE flag in this
case.

Firstly, we used the "NOW" states, so would mark our metadata Outdated
if we were a Primary with UpToDate data and no peers, and our disk
failed. Use the "NEW" states instead.

Secondly, do not consider peers that are disconnecting, because they
will not see that our disk state is Failed, and so will outdate
themselves. We do not want to outdate both nodes in this situation.
---
 drbd/drbd_state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drbd/drbd_state.c b/drbd/drbd_state.c
index 7e6e3477893d..8b60afeb097b 100644
--- a/drbd/drbd_state.c
+++ b/drbd/drbd_state.c
@@ -2489,15 +2489,24 @@ static void initialize_resync(struct drbd_peer_device *peer_device)
 /* Is there a primary with access to up to date data known */
 static bool primary_and_data_present(struct drbd_device *device)
 {
-	bool up_to_date_data = device->disk_state[NOW] == D_UP_TO_DATE;
-	bool primary = device->resource->role[NOW] == R_PRIMARY;
+	bool up_to_date_data = device->disk_state[NEW] == D_UP_TO_DATE;
+	struct drbd_resource *resource = device->resource;
+	bool primary = resource->role[NEW] == R_PRIMARY;
 	struct drbd_peer_device *peer_device;
 
 	for_each_peer_device(peer_device, device) {
-		if (peer_device->connection->peer_role[NOW] == R_PRIMARY)
+		struct drbd_connection *connection = peer_device->connection;
+
+		/* Do not consider the peer if we are disconnecting. */
+		if (resource->remote_state_change &&
+		    drbd_twopc_between_peer_and_me(connection) &&
+		    resource->twopc_reply.is_disconnect)
+			continue;
+
+		if (connection->peer_role[NEW] == R_PRIMARY)
 			primary = true;
 
-		if (peer_device->disk_state[NOW] == D_UP_TO_DATE)
+		if (peer_device->disk_state[NEW] == D_UP_TO_DATE)
 			up_to_date_data = true;
 	}
 
@@ -4808,6 +4817,7 @@ change_cluster_wide_state(bool (*change)(struct change_context *, enum change_ph
 	} else if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) {
 		reply->target_reachable_nodes = NODE_MASK(context->target_node_id);
 		reply->reachable_nodes &= ~reply->target_reachable_nodes;
+		reply->is_disconnect = 1;
 	} else {
 		reply->target_reachable_nodes = reply->reachable_nodes;
 	}
-- 
2.35.3