drbd/0002-drbd-improve-decision-about-marking-a-failed-disk-Ou.patch

From f2cd05b8d60d27f43b07175b92ef4c2a69b8e3a2 Mon Sep 17 00:00:00 2001
From: Joel Colledge <joel.colledge@linbit.com>
Date: Wed, 6 Sep 2023 15:49:44 +0200
Subject: [PATCH 02/20] drbd: improve decision about marking a failed disk
 Outdated

Sometimes it is possible to update the metadata even after our disk has
failed. We were too eager to remove the MDF_WAS_UP_TO_DATE flag in this
case.

Firstly, we used the "NOW" states, so we would mark our metadata
Outdated if we were a Primary with UpToDate data and no peers, and our
disk failed. Use the "NEW" states instead.

Secondly, do not consider peers that are disconnecting, because they
will not see that our disk state is Failed, and so will outdate
themselves. We do not want to outdate both nodes in this situation.
---
 drbd/drbd_state.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/drbd/drbd_state.c b/drbd/drbd_state.c
index 7e6e3477893d..8b60afeb097b 100644
--- a/drbd/drbd_state.c
+++ b/drbd/drbd_state.c
@@ -2489,15 +2489,24 @@ static void initialize_resync(struct drbd_peer_device *peer_device)
 /* Is there a primary with access to up to date data known */
 static bool primary_and_data_present(struct drbd_device *device)
 {
-	bool up_to_date_data = device->disk_state[NOW] == D_UP_TO_DATE;
-	bool primary = device->resource->role[NOW] == R_PRIMARY;
+	bool up_to_date_data = device->disk_state[NEW] == D_UP_TO_DATE;
+	struct drbd_resource *resource = device->resource;
+	bool primary = resource->role[NEW] == R_PRIMARY;
 	struct drbd_peer_device *peer_device;

 	for_each_peer_device(peer_device, device) {
-		if (peer_device->connection->peer_role[NOW] == R_PRIMARY)
+		struct drbd_connection *connection = peer_device->connection;
+
+		/* Do not consider the peer if we are disconnecting. */
+		if (resource->remote_state_change &&
+				drbd_twopc_between_peer_and_me(connection) &&
+				resource->twopc_reply.is_disconnect)
+			continue;
+
+		if (connection->peer_role[NEW] == R_PRIMARY)
 			primary = true;

-		if (peer_device->disk_state[NOW] == D_UP_TO_DATE)
+		if (peer_device->disk_state[NEW] == D_UP_TO_DATE)
 			up_to_date_data = true;
 	}

@@ -4808,6 +4817,7 @@ change_cluster_wide_state(bool (*change)(struct change_context *, enum change_ph
 	} else if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) {
 		reply->target_reachable_nodes = NODE_MASK(context->target_node_id);
 		reply->reachable_nodes &= ~reply->target_reachable_nodes;
+		reply->is_disconnect = 1;
 	} else {
 		reply->target_reachable_nodes = reply->reachable_nodes;
 	}
--
2.35.3