pacemaker/bsc-1094208-Fix-controld-able-to-manually-confirm-unseen-nodes-a.patch
Yan Gao f4fa43815b Accepting request 640081 from home:yan_gao:branches:network:ha-clustering:Factory:Test
- fenced: Handle fencing requested with nodeid by utilizing the membership cache of known nodes (bsc#1094208)
  * bsc-1094208-Refactor-fenced-Handle-fencing-requested-with-nodeid.patch
- controld: able to manually confirm unseen nodes are down (bsc#1094208)
  * bsc-1094208-Fix-controld-able-to-manually-confirm-unseen-nodes-a.patch

- Update to version 2.0.0+20180927.b67d8d0de:
- logrotate: set a maximum size for logs
- tools: ensure crm_resource --force-* commands get stderr messages
- libcrmcommon: properly check whether resource supports parameters
- tools: "return" from crm_mon after calling functions that don't
- alerts: send all MIB OIDs with all SNMP alerts
- resource-agents: add "s"-suffix where missing in metadata
- libcommon: do not write to /proc/sys/kernel/sysrq when unneeded
- pacemaker-based: drop declared, errant option never backed in tree
- crm_mon: don't exit directly from cib_connect on error
- scheduler: honor asymmetric orderings even when restarting

OBS-URL: https://build.opensuse.org/request/show/640081
OBS-URL: https://build.opensuse.org/package/show/network:ha-clustering:Factory/pacemaker?expand=0&rev=311

From 73a0ee287cd48ee10ed28f9071459d40d74e8801 Mon Sep 17 00:00:00 2001
From: "Gao,Yan" <ygao@suse.com>
Date: Fri, 1 Jun 2018 15:23:49 +0200
Subject: [PATCH 1/2] Fix: controld: able to manually confirm unseen nodes are
 down

9045bacb4 prevented manual fencing confirmations from creating node
entries for random unknown nodes, but it also disabled the ability to do
manual fencing confirmations for the nodes that are already known in the
CIB but not yet in the membership cache.
This commit fixes it by maintaining and utilizing an additional
membership cache of known nodes based on the CIB.
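
As a usage illustration (a minimal sketch, not part of the patch: the
handler name and the CIB argument are hypothetical, while
crm_peer_caches_refresh(), crm_find_known_peer_full() and
CRM_GET_PEER_ANY are taken from the changes below), a daemon reacting
to a manual confirmation such as "stonith_admin --confirm <node>"
would roughly do:

#include <crm/cluster/internal.h>

/* Sketch only: rebuild both caches from the current CIB copy, then
 * resolve a fencing target that may never have joined the membership */
static void
on_manual_fencing_confirmation(xmlNode *current_cib, const char *target)
{
    crm_node_t *peer = NULL;

    crm_peer_caches_refresh(current_cib);

    /* Unlike crm_find_peer_full(), this falls back to the CIB-based
     * known node cache when the membership caches have no match */
    peer = crm_find_known_peer_full(0, target, CRM_GET_PEER_ANY);

    if (peer != NULL) {
        /* ... record the node as cleanly fenced ... */
    }
}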
---
 daemons/controld/controld_schedulerd.c |   5 +-
 daemons/controld/controld_te_utils.c   |   2 +-
 include/crm/cluster/internal.h         |   3 +
 lib/cluster/membership.c               | 164 +++++++++++++++++++++++++++++++++
 4 files changed, 171 insertions(+), 3 deletions(-)

diff --git a/daemons/controld/controld_schedulerd.c b/daemons/controld/controld_schedulerd.c
index e5d5f69b0..4b53aaa97 100644
--- a/daemons/controld/controld_schedulerd.c
+++ b/daemons/controld/controld_schedulerd.c
@@ -355,8 +355,9 @@ do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void
CRM_LOG_ASSERT(output != NULL);
- // Refresh the remote node cache when the scheduler is invoked
- crm_remote_peer_cache_refresh(output);
+ /* Refresh the remote node cache and the known node cache when the
+ * scheduler is invoked */
+ crm_peer_caches_refresh(output);
crm_xml_add(output, XML_ATTR_DC_UUID, fsa_our_uuid);
crm_xml_add_int(output, XML_ATTR_HAVE_QUORUM, fsa_has_quorum);
diff --git a/daemons/controld/controld_te_utils.c b/daemons/controld/controld_te_utils.c
index 3f538b9bc..5606ed654 100644
--- a/daemons/controld/controld_te_utils.c
+++ b/daemons/controld/controld_te_utils.c
@@ -269,7 +269,7 @@ tengine_stonith_notify(stonith_t * st, stonith_event_t * st_event)
st_event->origin, st_event->id);
if (st_event->result == pcmk_ok) {
- crm_node_t *peer = crm_find_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
+ crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
const char *uuid = NULL;
gboolean we_are_executioner = safe_str_eq(st_event->executioner, fsa_our_uname);
diff --git a/include/crm/cluster/internal.h b/include/crm/cluster/internal.h
index 369f22700..12bf41ab0 100644
--- a/include/crm/cluster/internal.h
+++ b/include/crm/cluster/internal.h
@@ -329,4 +329,7 @@ gboolean node_name_is_valid(const char *key, const char *name);
crm_node_t * crm_find_peer_full(unsigned int id, const char *uname, int flags);
crm_node_t * crm_find_peer(unsigned int id, const char *uname);
+void crm_peer_caches_refresh(xmlNode *cib);
+crm_node_t *crm_find_known_peer_full(unsigned int id, const char *uname, int flags);
+
#endif
diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c
index a487e762a..e5151f2b7 100644
--- a/lib/cluster/membership.c
+++ b/lib/cluster/membership.c
@@ -50,6 +50,8 @@ GHashTable *crm_peer_cache = NULL;
*/
GHashTable *crm_remote_peer_cache = NULL;
+GHashTable *crm_known_peer_cache = NULL;
+
unsigned long long crm_peer_seq = 0;
gboolean crm_have_quorum = FALSE;
static gboolean crm_autoreap = TRUE;
@@ -394,6 +396,10 @@ crm_peer_init(void)
if (crm_remote_peer_cache == NULL) {
crm_remote_peer_cache = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, NULL, destroy_crm_node);
}
+
+ if (crm_known_peer_cache == NULL) {
+ crm_known_peer_cache = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, NULL, destroy_crm_node);
+ }
}
void
@@ -410,6 +416,13 @@ crm_peer_destroy(void)
g_hash_table_destroy(crm_remote_peer_cache);
crm_remote_peer_cache = NULL;
}
+
+ if (crm_known_peer_cache != NULL) {
+ crm_trace("Destroying known peer cache with %d members", g_hash_table_size(crm_known_peer_cache));
+ g_hash_table_destroy(crm_known_peer_cache);
+ crm_known_peer_cache = NULL;
+ }
+
}
void (*crm_status_callback) (enum crm_status_type, crm_node_t *, const void *) = NULL;
@@ -1001,3 +1014,154 @@ crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
{
return stonith_api_kick(nodeid, uname, 120, TRUE);
}
+
+static void
+known_peer_cache_refresh_helper(xmlNode *xml_node, void *user_data)
+{
+ const char *id = crm_element_value(xml_node, XML_ATTR_ID);
+ const char *uname = crm_element_value(xml_node, XML_ATTR_UNAME);
+ crm_node_t * node = NULL;
+
+ CRM_CHECK(id != NULL && uname !=NULL, return);
+ node = g_hash_table_lookup(crm_known_peer_cache, id);
+
+ if (node == NULL) {
+ node = calloc(1, sizeof(crm_node_t));
+ if (node == NULL) {
+ errno = -ENOMEM;
+ return;
+ }
+
+ node->uname = strdup(uname);
+ node->uuid = strdup(id);
+ if (node->uname == NULL || node->uuid == NULL) {
+ free(node);
+ errno = -ENOMEM;
+ return;
+ }
+
+ g_hash_table_replace(crm_known_peer_cache, node->uuid, node);
+
+ } else if (is_set(node->flags, crm_node_dirty)) {
+ if (safe_str_neq(uname, node->uname)) {
+ free(node->uname);
+ node->uname = strdup(uname);
+ CRM_ASSERT(node->uname != NULL);
+ }
+
+ /* Node is in cache and hasn't been updated already, so mark it clean */
+ clear_bit(node->flags, crm_node_dirty);
+ }
+
+}
+
+#define XPATH_MEMBER_NODE_CONFIG \
+ "//" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_NODES \
+ "/" XML_CIB_TAG_NODE "[not(@type) or @type='member']"
+
+static void
+crm_known_peer_cache_refresh(xmlNode *cib)
+{
+ crm_peer_init();
+
+ g_hash_table_foreach(crm_known_peer_cache, mark_dirty, NULL);
+
+ crm_foreach_xpath_result(cib, XPATH_MEMBER_NODE_CONFIG,
+ known_peer_cache_refresh_helper, NULL);
+
+ /* Remove all old cache entries that weren't seen in the CIB */
+ g_hash_table_foreach_remove(crm_known_peer_cache, is_dirty, NULL);
+}
+
+void
+crm_peer_caches_refresh(xmlNode *cib)
+{
+ crm_remote_peer_cache_refresh(cib);
+ crm_known_peer_cache_refresh(cib);
+}
+
+crm_node_t *
+crm_find_known_peer_full(unsigned int id, const char *uname, int flags)
+{
+ GHashTableIter iter;
+ crm_node_t *node = NULL;
+ crm_node_t *by_id = NULL;
+ crm_node_t *by_name = NULL;
+
+ CRM_ASSERT(id > 0 || uname != NULL);
+
+ node = crm_find_peer_full(id, uname, flags);
+
+ if (node || !(flags & CRM_GET_PEER_CLUSTER)) {
+ return node;
+ }
+
+ if (uname != NULL) {
+ g_hash_table_iter_init(&iter, crm_known_peer_cache);
+ while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+ if (node->uname && strcasecmp(node->uname, uname) == 0) {
+ crm_trace("Name match: %s = %p", node->uname, node);
+ by_name = node;
+ break;
+ }
+ }
+ }
+
+ if (id > 0) {
+ char * id_str = crm_strdup_printf("%u", id);
+
+ g_hash_table_iter_init(&iter, crm_known_peer_cache);
+ while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
+ if(node->id == id || strcasecmp(node->uuid, id_str) == 0) {
+ crm_trace("ID match: %u = %p", id, node);
+ by_id = node;
+ break;
+ }
+ }
+ free(id_str);
+ }
+
+ node = by_id; /* Good default */
+ if (by_id == by_name) {
+ /* Nothing to do if they match (both NULL counts) */
+ crm_trace("Consistent: %p for %u/%s", by_id, id, uname);
+
+ } else if (by_id == NULL && by_name) {
+ crm_trace("Only one: %p for %u/%s", by_name, id, uname);
+
+ if (id && by_name->id) {
+ crm_notice("Node %u and %u share the same name '%s'",
+ id, by_name->id, uname);
+ node = NULL;
+
+ } else if (id && by_name->uuid) {
+ crm_notice("Node %u and %s share the same name '%s'",
+ id, by_name->uuid, uname);
+ node = NULL;
+
+ } else {
+ node = by_name;
+ }
+
+ } else if (by_name == NULL && by_id) {
+ crm_trace("Only one: %p for %u/%s", by_id, id, uname);
+
+ if (uname && by_id->uname) {
+ crm_notice("Node '%s' and '%s' share the same cluster nodeid %u",
+ uname, by_id->uname, id);
+ }
+
+ } else if (uname && by_id->uname) {
+ if (safe_str_eq(uname, by_id->uname)) {
+ crm_notice("Node '%s' has changed its ID from %u to %u", by_id->uname, by_name->id, by_id->id);
+
+ } else {
+ crm_notice("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
+ }
+
+ } else if (id && by_name->id) {
+ crm_notice("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
+ }
+
+ return node;
+}
--
2.16.4
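
For reference, the XPATH_MEMBER_NODE_CONFIG expression in the patch
selects cluster node entries from the CIB configuration section. A
hand-written CIB fragment (node ids and names are illustrative) shows
what it does and does not match:

<cib>
  <configuration>
    <nodes>
      <node id="1" uname="alpha"/>               <!-- matched: no type -->
      <node id="2" uname="beta" type="member"/>  <!-- matched: type='member' -->
      <node id="vm1" uname="vm1" type="remote"/> <!-- not matched -->
    </nodes>
  </configuration>
</cib>

Each matched entry must provide both id and uname, or
known_peer_cache_refresh_helper() skips it; entries that disappear
from the CIB are dropped on the next refresh by the mark-dirty/sweep
pass in crm_known_peer_cache_refresh().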