# HG changeset patch # User Dejan Muhamedagic # Date 1314632951 -7200 # Node ID ccd0c1e1edf9f23cafb4363014acba755f1b4e25 # Parent d21f988a419c0c7fa349c4e26f6b500944d91370 Medium: Shell: several history improvements - add more patterns for fencing - handle better PE files number reaching limit diff --git a/doc/crm.8.txt b/doc/crm.8.txt --- a/doc/crm.8.txt +++ b/doc/crm.8.txt @@ -2426,7 +2426,8 @@ Example: The `latest` command shows a bit of recent history, more precisely whatever happened since the last cluster change (the -latest transition). +latest transition). If the transition is running, the shell will +first wait until it finishes. Usage: ............... @@ -2540,10 +2541,13 @@ Example: setnodes node_a node_b ............... -[[cmdhelp_history_resource,resource failed actions]] +[[cmdhelp_history_resource,resource events]] ==== `resource` -Show status changes and any failures that happened on a resource. +Show actions and any failures that happened on all specified +resources on all nodes. Normally, one gives resource names as +arguments, but it is also possible to use extended regular +expressions. Usage: ............... @@ -2551,14 +2555,17 @@ Usage: ............... Example: ............... - resource mydb + resource bigdb public_ip + resource bigdb:0 + resource bigdb:. ............... [[cmdhelp_history_node,node events]] ==== `node` Show important events that happened on a node. Important events -are node lost and join, standby and online, and fence. +are node lost and join, standby and online, and fence. Use either +node names or extended regular expressions. Usage: ............... @@ -2572,7 +2579,17 @@ Example: [[cmdhelp_history_log,log content]] ==== `log` -Show logs for a node or combined logs of all nodes. +Show messages logged on one or more nodes. Leaving out a node +name produces combined logs of all nodes. 
Messages are sorted by +time and, if the terminal emulation supports it, displayed in +different colours depending on the node to allow for easier +reading. + +The sorting key is the timestamp as written by syslog which +normally has the maximum resolution of one second. Obviously, +messages generated by events which share the same timestamp may +not be sorted in the same way as they happened. Such close events +may actually happen fairly often. Usage: ............... @@ -2634,8 +2651,8 @@ the transition are printed. Usage: ............... - transition [<number>|<file>] [nograph] [v...] [scores] [actions] [utilization] - transition showdot [<number>|<file>] + transition [<number>|<index>|<file>] [nograph] [v...] [scores] [actions] [utilization] + transition showdot [<number>|<index>|<file>] ............... Examples: ............... diff --git a/shell/modules/log_patterns.py b/shell/modules/log_patterns.py --- a/shell/modules/log_patterns.py +++ b/shell/modules/log_patterns.py @@ -12,34 +12,41 @@ # detail level 0 is the lowest, i.e. should match the least # number of relevant messages -# NB: If you modify this file, you must follow python syntax! +# NB: +# %% stands for whatever user input we get, for instance a +# resource name or node name or just some regular expression +# in optimal case, it should be surrounded by literals +# +# [Note that resources may contain clone numbers!] 
log_patterns = { "resource": ( ( # detail 0 - "lrmd:.*rsc:%%.*(start|stop|promote|demote|migrate)", - "lrmd:.*RA output:.*%%.*stderr", - "lrmd:.*WARN:.*Managed.*%%.*exited", + "lrmd:.*rsc:%% (start|stop|promote|demote|migrate)", + "lrmd:.*RA output: .%%:.*:stderr", + "lrmd:.*WARN: Managed %%:.*exited", ), ( # detail 1 - "lrmd:.*rsc:%%.*(probe|notify)", - "lrmd:.*info:.*Managed.*%%.*exited", + "lrmd:.*rsc:%%:.*(probe|notify)", + "lrmd:.*info: Managed %%:.*exited", ), ), "node": ( ( # detail 0 - "%%.*Corosync.Cluster.Engine", - "%%.*Executive.Service.RELEASE", - "%%.*crm_shutdown:.Requesting.shutdown", - "%%.*pcmk_shutdown:.Shutdown.complete", - "%%.*Configuration.validated..Starting.heartbeat", - "pengine.*Scheduling Node %%", - "te_fence_node.*Exec.*%%", - "stonith-ng.*log_oper.*reboot.*%%", - "stonithd.*to STONITH.*%%", - "stonithd.*fenced node %%", - "pcmk_peer_update.*(lost|memb): %%", - "crmd.*ccm_event.*(NEW|LOST) %%", + " %% .*Corosync.Cluster.Engine", + " %% .*Executive.Service.RELEASE", + " %% .*crm_shutdown:.Requesting.shutdown", + " %% .*pcmk_shutdown:.Shutdown.complete", + " %% .*Configuration.validated..Starting.heartbeat", + "pengine.*Scheduling Node %% for STONITH", + "crmd.* tengine_stonith_callback: .* of %% failed", + "stonith-ng.*log_operation:.*host '%%'", + "te_fence_node: Exec.*on %% ", + "pe_fence_node: Node %% will be fenced", + "stonith-ng.*remote_op_timeout:.*for %% timed", + "stonithd.*Succeeded.*node %%:", + "pcmk_peer_update.*(lost|memb): %% ", + "crmd.*ccm_event.*(NEW|LOST):.* %% ", ), ( # detail 1 ), diff --git a/shell/modules/report.py b/shell/modules/report.py --- a/shell/modules/report.py +++ b/shell/modules/report.py @@ -589,7 +589,7 @@ class Report(Singleton): except IOError,msg: common_err("open %s: %s"%(fl[0],msg)) continue - pe_l = self.get_transitions([x for x in f], keep_pe_path = True) + pe_l = self.list_transitions([x for x in f], future_pe = True) if pe_l: l.append([node,pe_l]) return l @@ -752,12 +752,13 @@ class 
Report(Singleton): for n in self.cibnode_l: self.nodecolor[n] = self.nodecolors[i] i = (i+1) % len(self.nodecolors) - def get_transitions(self, msg_l = None, keep_pe_path = False): + def list_transitions(self, msg_l = None, future_pe = False): ''' - Get a list of transitions. + List transitions by reading logs. Empty transitions are skipped. - Some callers need original PE file path (keep_pe_path), - otherwise we produce the path within the report. + Some callers need original PE file path (future_pe), + otherwise we produce the path within the report and check + if the transition files exist. If the caller doesn't provide the message list, then we build it from the collected log files (self.logobj). Otherwise, we get matches for transition patterns. @@ -786,11 +787,18 @@ class Report(Singleton): continue elif num_actions == -1: # couldn't find messages common_warn("could not find number of actions for transition (%s)" % pe_base) - common_debug("found PE input at %s: %s" % (node, pe_file)) - if keep_pe_path: - pe_l.append(pe_file) + if not future_pe: + pe_l_file = os.path.join(self.loc, node, "pengine", pe_base) + if not os.path.isfile(pe_l_file): + warn_once("%s in the logs, but not in the report" % pe_l_file) + continue else: - pe_l.append(os.path.join(self.loc, node, "pengine", pe_base)) + pe_l_file = "%s:%s" % (node, pe_file) + if pe_l_file in pe_l: + common_warn("duplicate %s, replacing older PE file" % pe_l_file) + pe_l.remove(pe_l_file) + common_debug("found PE input: %s" % pe_l_file) + pe_l.append(pe_l_file) return pe_l def report_setup(self): if not self.loc: @@ -802,11 +810,7 @@ class Report(Singleton): self.set_node_colors() self.logobj = LogSyslog(self.central_log, self.log_l, \ self.from_dt, self.to_dt) - self.peinputs_l = self.get_transitions() - for pe_input in self.peinputs_l: - if not os.path.isfile(pe_input): - warn_once("%s in the logs, but not in the report" % pe_input) - self.peinputs_l.remove(pe_input) + self.peinputs_l = 
self.list_transitions() def prepare_source(self): ''' Unpack a hb_report tarball. @@ -859,7 +863,7 @@ class Report(Singleton): if not args: re_l = mk_re_list(patt_l,"") else: - re_l = mk_re_list(patt_l,r'(%s)\W' % "|".join(args)) + re_l = mk_re_list(patt_l,r'(%s)' % "|".join(args)) return re_l def disp(self, s): 'color output' @@ -886,11 +890,6 @@ class Report(Singleton): self.error("no logs found") return self.display_logs(self.logobj.get_matches(re_l, log_l)) - def match_args(self, cib_l, args): - for a in args: - a_clone = re.sub(r':.*', '', a) - if not (a in cib_l) and not (a_clone in cib_l): - self.warn("%s not found in report, proceeding anyway" % a) def get_desc_line(self,fld): try: f = open(self.desc) @@ -923,8 +922,9 @@ class Report(Singleton): ''' Show all events. ''' - all_re_l = self.build_re("resource",self.cibrsc_l) + \ - self.build_re("node",self.cibnode_l) + all_re_l = self.build_re("resource", self.cibrsc_l) + \ + self.build_re("node", self.cibnode_l) + \ + self.build_re("events", []) if not all_re_l: self.error("no resources or nodes found") return False @@ -940,6 +940,7 @@ class Report(Singleton): te_invoke_patt = transition_patt[0].replace("%%", pe_num) run_patt = transition_patt[1].replace("%%", pe_num) r = None + msg_l.reverse() for msg in msg_l: r = re.search(te_invoke_patt, msg) if r: @@ -1009,7 +1010,6 @@ class Report(Singleton): expanded_l += self.cibgrp_d[a] else: expanded_l.append(a) - self.match_args(self.cibrsc_l,expanded_l) rsc_re_l = self.build_re("resource",expanded_l) if not rsc_re_l: return False @@ -1020,7 +1020,6 @@ class Report(Singleton): ''' if not self.prepare_source(): return False - self.match_args(self.cibnode_l,args) node_re_l = self.build_re("node",args) if not node_re_l: return False diff --git a/shell/modules/ui.py.in b/shell/modules/ui.py.in --- a/shell/modules/ui.py.in +++ b/shell/modules/ui.py.in @@ -1877,16 +1877,16 @@ Examine Pacemaker's history: node and re def _get_pe_byidx(self, idx): l = crm_report.pelist() 
if len(l) < abs(idx): - common_err("pe input file for index %d not found" % (idx+1)) + common_err("PE input file for index %d not found" % (idx+1)) return None return l[idx] def _get_pe_bynum(self, n): l = crm_report.pelist([n]) if len(l) == 0: - common_err("%s: PE file %d not found" % n) + common_err("PE file %d not found" % n) return None elif len(l) > 1: - common_err("%s: PE file %d ambiguous" % n) + common_err("PE file %d ambiguous" % n) return None return l[0] def transition(self,cmd,*args):