66 lines
2.5 KiB
Diff
66 lines
2.5 KiB
Diff
|
From f2cd05b8d60d27f43b07175b92ef4c2a69b8e3a2 Mon Sep 17 00:00:00 2001
|
||
|
From: Joel Colledge <joel.colledge@linbit.com>
|
||
|
Date: Wed, 6 Sep 2023 15:49:44 +0200
|
||
|
Subject: [PATCH 02/20] drbd: improve decision about marking a failed disk
|
||
|
Outdated
|
||
|
|
||
|
Sometimes it is possible to update the metadata even after our disk has
|
||
|
failed. We were too eager to remove the MDF_WAS_UP_TO_DATE flag in this
|
||
|
case.
|
||
|
|
||
|
Firstly, we used the "NOW" states, so we would mark our metadata Outdated
|
||
|
if we were a Primary with UpToDate data and no peers, and our disk
|
||
|
failed. Use the "NEW" states instead.
|
||
|
|
||
|
Secondly, do not consider peers that are disconnecting, because they
|
||
|
will not see that our disk state is Failed, and so will outdate
|
||
|
themselves. We do not want to outdate both nodes in this situation.
|
||
|
---
|
||
|
drbd/drbd_state.c | 18 ++++++++++++++----
|
||
|
1 file changed, 14 insertions(+), 4 deletions(-)
|
||
|
|
||
|
diff --git a/drbd/drbd_state.c b/drbd/drbd_state.c
|
||
|
index 7e6e3477893d..8b60afeb097b 100644
|
||
|
--- a/drbd/drbd_state.c
|
||
|
+++ b/drbd/drbd_state.c
|
||
|
@@ -2489,15 +2489,24 @@ static void initialize_resync(struct drbd_peer_device *peer_device)
|
||
|
/* Is there a primary with access to up to date data known */
|
||
|
static bool primary_and_data_present(struct drbd_device *device)
|
||
|
{
|
||
|
- bool up_to_date_data = device->disk_state[NOW] == D_UP_TO_DATE;
|
||
|
- bool primary = device->resource->role[NOW] == R_PRIMARY;
|
||
|
+ bool up_to_date_data = device->disk_state[NEW] == D_UP_TO_DATE;
|
||
|
+ struct drbd_resource *resource = device->resource;
|
||
|
+ bool primary = resource->role[NEW] == R_PRIMARY;
|
||
|
struct drbd_peer_device *peer_device;
|
||
|
|
||
|
for_each_peer_device(peer_device, device) {
|
||
|
- if (peer_device->connection->peer_role[NOW] == R_PRIMARY)
|
||
|
+ struct drbd_connection *connection = peer_device->connection;
|
||
|
+
|
||
|
+ /* Do not consider the peer if we are disconnecting. */
|
||
|
+ if (resource->remote_state_change &&
|
||
|
+ drbd_twopc_between_peer_and_me(connection) &&
|
||
|
+ resource->twopc_reply.is_disconnect)
|
||
|
+ continue;
|
||
|
+
|
||
|
+ if (connection->peer_role[NEW] == R_PRIMARY)
|
||
|
primary = true;
|
||
|
|
||
|
- if (peer_device->disk_state[NOW] == D_UP_TO_DATE)
|
||
|
+ if (peer_device->disk_state[NEW] == D_UP_TO_DATE)
|
||
|
up_to_date_data = true;
|
||
|
}
|
||
|
|
||
|
@@ -4808,6 +4817,7 @@ change_cluster_wide_state(bool (*change)(struct change_context *, enum change_ph
|
||
|
} else if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) {
|
||
|
reply->target_reachable_nodes = NODE_MASK(context->target_node_id);
|
||
|
reply->reachable_nodes &= ~reply->target_reachable_nodes;
|
||
|
+ reply->is_disconnect = 1;
|
||
|
} else {
|
||
|
reply->target_reachable_nodes = reply->reachable_nodes;
|
||
|
}
|
||
|
--
|
||
|
2.35.3
|
||
|
|