93 lines
3.6 KiB
Diff
93 lines
3.6 KiB
Diff
|
From 4f0681b1a296d88ac1dbdb26e46afed3285ad1bf Mon Sep 17 00:00:00 2001
|
||
|
From: Eric Ren <zren@suse.com>
|
||
|
Date: Tue, 23 May 2017 15:09:46 +0800
|
||
|
Subject: [PATCH 09/10] clvmd: try to refresh device cache on the first failure
|
||
|
|
||
|
1. The original problem
|
||
|
$ sudo lvchange -ay testvg/testlv
|
||
|
Error locking on node 1302cf30: Volume group for uuid not found:
|
||
|
qBKu65bSxfRq7gUf91NZuH4epLza4ifDieQJFd2to2WruVi5Brn7DxxsEgi5Zodw
|
||
|
|
||
|
2. This problem can be easily replicated
|
||
|
a. Make sure clvmd is running in a cluster environment;
|
||
|
b. Assume you have created LV "testlv" in local VG 'testvg' on
|
||
|
an MD device 'md0';
|
||
|
c. Make sure 'md0' is stopped, and not in the device cache by
|
||
|
executing 'clvmd -R' or 'pvscan';
|
||
|
d. Assemble 'md0' by issuing 'mdadm --assemble --scan --name md0';
|
||
|
e. To activate 'testlv', you will see the 'Error locking' problem.
|
||
|
|
||
|
3. Analysis
|
||
|
a. After step 2.d, 'pvscan --cache ...' is triggered by udev rules,
|
||
|
notifying that 'md0' is ready. But, pvscan exits very early because
|
||
|
lvmetad is not being used, thus doesn't go through the lock manager.
|
||
|
Therefore, clvmd isn't aware of this udev event. The device cache
|
||
|
doesn't contain 'md0'.
|
||
|
|
||
|
b. In step 2.e, the client, 'lvchange -ay testvg/testlv' cmd, can find
|
||
|
'testlv' correctly in the client metadata, because the device list
|
||
|
is gathered by call chain:
|
||
|
lvm_run_command()->init_filters()->persistent_filter_load()->dev_cache_scan().
|
||
|
Then, it asks clvmd for "Locking VG V_testvg CR", which just drops
|
||
|
the metadata in clvmd by call chain: do_lock_vg()->lvmcache_drop_metadata(),
|
||
|
but the device cache is *not* refreshed.
|
||
|
|
||
|
c. Finally, clvmd fails to find the lvid in activation path:
|
||
|
do_lock_lv()->do_activate_lv()->lv_info_by_lvid()
|
||
|
|
||
|
Apparently, the metadata DB is not complete without a complete device
|
||
|
cache in clvmd. However, upstream says the pvscan tool is intended to be
|
||
|
only used with lvmetad, and advises against hacking there. So, we'd
|
||
|
better fix this issue within clvmd code.
|
||
|
|
||
|
Sometimes, the device cache in clvmd could be out of date.
|
||
|
"clvmd -R" was invented for this issue. However, running
|
||
|
"clvmd -R" manually is not convenient, because it's hard
|
||
|
to predict when a device change will happen.
|
||
|
|
||
|
This patch gives another try after refreshing the device
|
||
|
cache. In the normal case, it doesn't cause any side effects. In
|
||
|
case of the issue above, it's worth a retry.
|
||
|
|
||
|
Signed-off-by: Eric Ren <zren@suse.com>
|
||
|
---
|
||
|
daemons/clvmd/lvm-functions.c | 11 ++++++++++-
|
||
|
1 file changed, 10 insertions(+), 1 deletion(-)
|
||
|
|
||
|
diff --git a/daemons/clvmd/lvm-functions.c b/daemons/clvmd/lvm-functions.c
|
||
|
index 2446fd1..dcd3f9b 100644
|
||
|
--- a/daemons/clvmd/lvm-functions.c
|
||
|
+++ b/daemons/clvmd/lvm-functions.c
|
||
|
@@ -509,11 +509,14 @@ const char *do_lock_query(char *resource)
|
||
|
int do_lock_lv(unsigned char command, unsigned char lock_flags, char *resource)
|
||
|
{
|
||
|
int status = 0;
|
||
|
+ int do_refresh = 0;
|
||
|
|
||
|
DEBUGLOG("do_lock_lv: resource '%s', cmd = %s, flags = %s, critical_section = %d\n",
|
||
|
resource, decode_locking_cmd(command), decode_flags(lock_flags), critical_section());
|
||
|
|
||
|
- if (!cmd->initialized.config || config_files_changed(cmd)) {
|
||
|
+again:
|
||
|
+ if (!cmd->initialized.config || config_files_changed(cmd)
|
||
|
+ || do_refresh) {
|
||
|
/* Reinitialise various settings inc. logging, filters */
|
||
|
if (do_refresh_cache()) {
|
||
|
log_error("Updated config file invalid. Aborting.");
|
||
|
@@ -579,6 +582,12 @@ int do_lock_lv(unsigned char command, unsigned char lock_flags, char *resource)
|
||
|
init_test(0);
|
||
|
pthread_mutex_unlock(&lvm_lock);
|
||
|
|
||
|
+ /* Try again in case device cache is stale */
|
||
|
+ if (status == EIO && !do_refresh) {
|
||
|
+ do_refresh = 1;
|
||
|
+ goto again;
|
||
|
+ }
|
||
|
+
|
||
|
DEBUGLOG("Command return is %d, critical_section is %d\n", status, critical_section());
|
||
|
return status;
|
||
|
}
|
||
|
--
|
||
|
2.10.2
|
||
|
|