root/lib/pengine/failcounts.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. is_matched_failure
  2. block_failure
  3. rsc_fail_name
  4. generate_fail_regex
  5. generate_fail_regexes
  6. pe_get_failcount
  7. pe__clear_failcount

   1 /*
   2  * Copyright 2008-2022 the Pacemaker project contributors
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
   8 #include <crm_internal.h>
   9 
  10 #include <sys/types.h>
  11 #include <regex.h>
  12 #include <glib.h>
  13 
  14 #include <crm/crm.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 #include <crm/common/util.h>
  18 #include <crm/pengine/internal.h>
  19 
  20 static gboolean
  21 is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml,
     /* [previous][next][first][last][top][bottom][index][help] */
  22                    xmlNode *lrm_op_xml)
  23 {
         /* Decide whether an operation history entry records a failure of a
          * given configured operation.
          *
          * rsc_id       Resource ID used to build the operation keys that the
          *              history entry's ID is compared against
          * conf_op_xml  Configured operation; its "name" and interval are read
          * lrm_op_xml   Operation history entry; its task, interval, ID and
          *              return code are read
          *
          * Returns TRUE when both describe the same operation name and interval
          * and the history entry is either the resource's "last_failure" entry
          * or the entry for that operation with a return code different from
          * the one pe__target_rc_from_xml() reports for it. Returns FALSE
          * otherwise, including when any argument is NULL.
          */
  24     gboolean matched = FALSE;
  25     const char *conf_op_name = NULL;
  26     const char *lrm_op_task = NULL;
  27     const char *conf_op_interval_spec = NULL;
  28     guint conf_op_interval_ms = 0;
  29     guint lrm_op_interval_ms = 0;
  30     const char *lrm_op_id = NULL;
  31     char *last_failure_key = NULL;
  32 
  33     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
  34         return FALSE;
  35     }
  36 
  37     // Get name and interval from configured op
  38     conf_op_name = crm_element_value(conf_op_xml, "name");
  39     conf_op_interval_spec = crm_element_value(conf_op_xml,
  40                                               XML_LRM_ATTR_INTERVAL);
  41     conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
  42 
  43     // Get name and interval from op history entry
  44     lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
         // The result of this call is not checked; lrm_op_interval_ms was
         // initialized to 0 above
  45     crm_element_value_ms(lrm_op_xml, XML_LRM_ATTR_INTERVAL_MS,
  46                          &lrm_op_interval_ms);
  47 
         // Different interval or different operation name: not the same operation
         // (pcmk__str_casei presumably requests a case-insensitive comparison)
  48     if ((conf_op_interval_ms != lrm_op_interval_ms)
  49         || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
  50         return FALSE;
  51     }
  52 
  53     lrm_op_id = ID(lrm_op_xml);
  54     last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
  55 
         // A history entry whose ID is the resource's last_failure key matches
         // without looking at its return code
  56     if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
  57         matched = TRUE;
  58 
  59     } else {
  60         char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
  61                                                 conf_op_interval_ms);
  62 
             // Otherwise the entry must be the one for this operation, and it
             // only counts as a failure if its return code is not the target
  63         if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
  64             int rc = 0;
  65             int target_rc = pe__target_rc_from_xml(lrm_op_xml);
  66 
                 // rc stays 0 if the attribute cannot be read (result unchecked)
  67             crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
  68             if (rc != target_rc) {
  69                 matched = TRUE;
  70             }
  71         }
  72         free(expected_op_key);
  73     }
  74 
  75     free(last_failure_key);
  76     return matched;
  77 }
  78 
  79 static gboolean
  80 block_failure(pe_node_t *node, pe_resource_t *rsc, xmlNode *xml_op,
     /* [previous][next][first][last][top][bottom][index][help] */
  81               pe_working_set_t *data_set)
  82 {
         /* Check whether a failure of rsc matches a configured operation that
          * has on-fail="block".
          *
          * node      Node whose operation history is searched; dereferenced only
          *           when xml_op is NULL
          * rsc       Resource whose configuration (rsc->xml) is searched
          * xml_op    History entry to test, or NULL to test every matching
          *           history entry found in data_set->input for this node
          * data_set  Working set; its input is searched only when xml_op is NULL
          *
          * Returns TRUE as soon as is_matched_failure() accepts one pairing of
          * a configured on-fail="block" operation with a history entry, FALSE
          * if none does.
          */

         // ID used in the XPath queries below (presumably rsc->id with any clone
         // instance suffix removed -- confirm against clone_strip())
  83     char *xml_name = clone_strip(rsc->id);
  84 
  85     /* @TODO This xpath search occurs after template expansion, but it is unable
  86      * to properly detect on-fail in id-ref, operation meta-attributes, or
  87      * op_defaults, or evaluate rules.
  88      *
  89      * Also, on-fail defaults to block (in unpack_operation()) for stop actions
  90      * when stonith is disabled.
  91      *
  92      * Ideally, we'd unpack the operation before this point, and pass in a
  93      * meta-attributes table that takes all that into consideration.
  94      */
  95     char *xpath = crm_strdup_printf("//primitive[@id='%s']//op[@on-fail='block']",
  96                                     xml_name);
  97 
  98     xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
  99     gboolean should_block = FALSE;
 100 
 101     free(xpath);
 102 
 103     if (xpathObj) {
 104         int max = numXpathResults(xpathObj);
 105         int lpc = 0;
 106 
             // One iteration per configured operation with on-fail="block"
 107         for (lpc = 0; lpc < max; lpc++) {
 108             xmlNode *pref = getXpathResult(xpathObj, lpc);
 109 
 110             if (xml_op) {
                     // Caller supplied the history entry: test it directly
 111                 should_block = is_matched_failure(xml_name, pref, xml_op);
 112                 if (should_block) {
 113                     break;
 114                 }
 115 
 116             } else {
                     // No history entry supplied: look up this node's recorded
                     // entries for the configured operation and test each one
 117                 const char *conf_op_name = NULL;
 118                 const char *conf_op_interval_spec = NULL;
 119                 guint conf_op_interval_ms = 0;
 120                 char *lrm_op_xpath = NULL;
 121                 xmlXPathObject *lrm_op_xpathObj = NULL;
 122 
 123                 // Get name and interval from configured op
 124                 conf_op_name = crm_element_value(pref, "name");
 125                 conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
 126                 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
 127 
                     /* NOTE(review): conf_op_name is passed to "%s" without a
                      * NULL check; confirm every <op> matched above carries a
                      * "name" attribute. The query also filters on a literal
                      * "interval" attribute while is_matched_failure() reads
                      * XML_LRM_ATTR_INTERVAL_MS -- confirm both refer to the
                      * same attribute.
                      */
 128                 lrm_op_xpath = crm_strdup_printf("//node_state[@uname='%s']"
 129                                                "//lrm_resource[@id='%s']"
 130                                                "/lrm_rsc_op[@operation='%s'][@interval='%u']",
 131                                                node->details->uname, xml_name,
 132                                                conf_op_name, conf_op_interval_ms);
 133                 lrm_op_xpathObj = xpath_search(data_set->input, lrm_op_xpath);
 134 
 135                 free(lrm_op_xpath);
 136 
 137                 if (lrm_op_xpathObj) {
 138                     int max2 = numXpathResults(lrm_op_xpathObj);
 139                     int lpc2 = 0;
 140 
 141                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
 142                         xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
 143                                                              lpc2);
 144 
 145                         should_block = is_matched_failure(xml_name, pref,
 146                                                           lrm_op_xml);
 147                         if (should_block) {
 148                             break;
 149                         }
 150                     }
 151                 }
                     // Called even when the search returned NULL
 152                 freeXpathObject(lrm_op_xpathObj);
 153 
                     // The inner break only left the history loop; leave the
                     // configured-operation loop as well
 154                 if (should_block) {
 155                     break;
 156                 }
 157             }
 158         }
 159     }
 160 
 161     free(xml_name);
 162     freeXpathObject(xpathObj);
 163 
 164     return should_block;
 165 }
 166 
 167 /*!
 168  * \internal
 169  * \brief Get resource name as used in failure-related node attributes
 170  *
 171  * \param[in] rsc  Resource to check
 172  *
 173  * \return Newly allocated string containing resource's fail name
 174  * \note The caller is responsible for freeing the result.
      * \note The name is taken from rsc->clone_name when that is set and from
      *       rsc->id otherwise. It is duplicated unchanged when the resource
      *       has the pe_rsc_unique flag and passed through clone_strip()
      *       when it does not.
 175  */
 176 static inline char *
 177 rsc_fail_name(pe_resource_t *rsc)
     /* [previous][next][first][last][top][bottom][index][help] */
 178 {
 179     const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
 180 
         // NOTE(review): the strdup() result is returned unchecked, so the
         // caller may receive NULL on allocation failure
 181     return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
 182 }
 183 
 184 /*!
 185  * \internal
 186  * \brief Compile regular expression to match a failure-related node attribute
 187  *
 188  * \param[in]  prefix    Attribute prefix to match
 189  * \param[in]  rsc_name  Resource name to match as used in failure attributes
 190  * \param[in]  is_legacy Whether DC uses per-resource fail counts
 191  * \param[in]  is_unique Whether the resource is a globally unique clone
 192  * \param[out] re        Where to store resulting regular expression
 193  *
 194  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
 195  *       The caller is responsible for freeing re with regfree().
      * \note The pattern is anchored at both ends and compiled with
      *       REG_EXTENDED|REG_NOSUB, so it can only be used for whole-name
      *       match/no-match tests.
      * \note NOTE(review): prefix and rsc_name are inserted into the pattern
      *       verbatim, so any regular expression metacharacters they contain
      *       are interpreted rather than matched literally.
 196  */
 197 static void
 198 generate_fail_regex(const char *prefix, const char *rsc_name,
     /* [previous][next][first][last][top][bottom][index][help] */
 199                     gboolean is_legacy, gboolean is_unique, regex_t *re)
 200 {
 201     char *pattern;
 202 
 203     /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
 204      * per-operation.
 205      */
         // Legacy names carry no "#OP_INTERVAL" suffix, so none is required
 206     const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
 207 
 208     /* Ignore instance numbers for anything other than globally unique clones.
 209      * Anonymous clone fail counts could contain an instance number if the
 210      * clone was initially unique, failed, then was converted to anonymous.
 211      * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
 212      * clone instance numbers.
 213      */
 214     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
 215 
 216     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
 217                                 instance_pattern, op_pattern);
         /* NOTE(review): a regcomp() failure is only reported through
          * CRM_LOG_ASSERT. If that macro does not abort, *re is left without
          * a compiled expression while this function's contract still has the
          * caller use and regfree() it -- confirm.
          */
 218     CRM_LOG_ASSERT(regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) == 0);
 219     free(pattern);
 220 }
 221 
 222 /*!
 223  * \internal
 224  * \brief Compile regular expressions to match failure-related node attributes
 225  *
 226  * \param[in]  rsc             Resource being checked for failures
 227  * \param[in]  data_set        Data set (for CRM feature set version)
 228  * \param[out] failcount_re    Storage for regular expression for fail count
 229  * \param[out] lastfailure_re  Storage for regular expression for last failure
 230  *
 231  * \note The caller is responsible for freeing the expressions with regfree().
      * \note Both expressions are built from the same resource fail name,
      *       legacy setting and pe_rsc_unique flag; they differ only in the
      *       attribute prefix.
 232  */
 233 static void
 234 generate_fail_regexes(pe_resource_t *rsc, pe_working_set_t *data_set,
     /* [previous][next][first][last][top][bottom][index][help] */
 235                       regex_t *failcount_re, regex_t *lastfailure_re)
 236 {
 237     char *rsc_name = rsc_fail_name(rsc);
 238     const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
         // Feature sets that compare lower than 3.0.13 get the legacy pattern
         // (no per-operation suffix)
 239     gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
 240 
 241     generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
 242                         pcmk_is_set(rsc->flags, pe_rsc_unique), failcount_re);
 243 
 244     generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name, is_legacy,
 245                         pcmk_is_set(rsc->flags, pe_rsc_unique), lastfailure_re);
 246 
 247     free(rsc_name);
 248 }
 249 
 250 int
 251 pe_get_failcount(pe_node_t *node, pe_resource_t *rsc, time_t *last_failure,
     /* [previous][next][first][last][top][bottom][index][help] */
 252                  uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
 253 {
         /* Get a resource's fail count on a node from the node's attributes.
          *
          * node          Node whose attribute table (node->details->attrs) is
          *               scanned
          * rsc           Resource to get the fail count for
          * last_failure  If not NULL, receives the latest matching last-failure
          *               value, but only when a positive fail count and a
          *               positive last-failure value were found; it may also be
          *               raised by a filler's value (see below)
          * flags         pe_fc_effective: treat the count as 0 once the
          *               resource's failure timeout has passed;
          *               pe_fc_fillers: also add the fail counts of
          *               rsc->fillers
          * xml_op        History entry handed to block_failure(); may be NULL
          * data_set      Working set
          *
          * Returns the sum of all matching fail-count attributes (plus those of
          * any fillers counted), or 0 if the count was treated as expired.
          *
          * Side effect: rsc->failure_timeout is reset to 0 when
          * block_failure() reports a blocking failure.
          */
 254     char *key = NULL;
 255     const char *value = NULL;
 256     regex_t failcount_re, lastfailure_re;
 257     int failcount = 0;
 258     time_t last = 0;
 259     GHashTableIter iter;
 260 
 261     generate_fail_regexes(rsc, data_set, &failcount_re, &lastfailure_re);
 262 
 263     /* Resource fail count is sum of all matching operation fail counts */
 264     g_hash_table_iter_init(&iter, node->details->attrs);
 265     while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
 266         if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
 267             failcount = pcmk__add_scores(failcount, char2score(value));
 268         } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
 269             long long last_ll;
 270 
                 // Keep the largest last-failure value seen; values that do
                 // not scan successfully are skipped
 271             if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
 272                 last = (time_t) QB_MAX(last, last_ll);
 273             }
 274         }
 275     }
 276 
 277     regfree(&failcount_re);
 278     regfree(&lastfailure_re);
 279 
 280     if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
 281         *last_failure = last;
 282     }
 283 
 284     /* If failure blocks the resource, disregard any failure timeout */
 285     if ((failcount > 0) && rsc->failure_timeout
 286         && block_failure(node, rsc, xml_op, data_set)) {
 287 
 288         pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
 289                 rsc->failure_timeout, rsc->id);
             // Persistent change to the resource: with the timeout cleared,
             // the expiry check below is skipped
 290         rsc->failure_timeout = 0;
 291     }
 292 
 293     /* If all failures have expired, ignore fail count */
 294     if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
 295         && rsc->failure_timeout) {
 296 
 297         time_t now = get_effective_time(data_set);
 298 
 299         if (now > (last + rsc->failure_timeout)) {
 300             crm_debug("Failcount for %s on %s expired after %ds",
 301                       rsc->id, pe__node_name(node), rsc->failure_timeout);
 302             failcount = 0;
 303         }
 304     }
 305 
 306     /* We never want the fail counts of a bundle container's fillers to
 307      * count towards the container's fail count.
 308      *
 309      * Most importantly, a Pacemaker Remote connection to a bundle container
 310      * is a filler of the container, but can reside on a different node than the
 311      * container itself. Counting its fail count on its node towards the
 312      * container's fail count on that node could lead to attempting to stop the
 313      * container on the wrong node.
 314      */
 315 
 316     if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers
 317         && !pe_rsc_is_bundled(rsc)) {
 318 
 319         GList *gIter = NULL;
 320 
 321         for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
 322             pe_resource_t *filler = (pe_resource_t *) gIter->data;
 323             time_t filler_last_failure = 0;
 324 
                 /* Recurse with the same flags. The filler's count is added
                  * with plain integer addition rather than pcmk__add_scores()
                  * as used above -- NOTE(review): confirm that is intended.
                  */
 325             failcount += pe_get_failcount(node, filler, &filler_last_failure,
 326                                           flags, xml_op, data_set);
 327 
                 /* NOTE(review): *last_failure is read here, but this function
                  * assigns it earlier only when a positive fail count and
                  * last-failure value were found, so the comparison appears
                  * to rely on the caller having initialized it -- confirm.
                  */
 328             if (last_failure && filler_last_failure > *last_failure) {
 329                 *last_failure = filler_last_failure;
 330             }
 331         }
 332 
 333         if (failcount > 0) {
 334             crm_info("Container %s and the resources within it "
 335                      "have failed %s time%s on %s",
 336                      rsc->id, pcmk_readable_score(failcount),
 337                      pcmk__plural_s(failcount), pe__node_name(node));
 338         }
 339 
 340     } else if (failcount > 0) {
 341         crm_info("%s has failed %s time%s on %s",
 342                  rsc->id, pcmk_readable_score(failcount),
 343                  pcmk__plural_s(failcount), pe__node_name(node));
 344     }
 345 
 346     return failcount;
 347 }
 348 
 349 /*!
 350  * \brief Schedule a controller operation to clear a fail count
 351  *
 352  * \param[in] rsc       Resource with failure
 353  * \param[in] node      Node failure occurred on
 354  * \param[in] reason    Readable description why needed (for logging)
 355  * \param[in] data_set  Working set for cluster
 356  *
 357  * \return Scheduled action
      * \note If any argument is NULL, CRM_CHECK() takes its "return NULL"
      *       action.
 358  */
 359 pe_action_t *
 360 pe__clear_failcount(pe_resource_t *rsc, pe_node_t *node,
     /* [previous][next][first][last][top][bottom][index][help] */
 361                     const char *reason, pe_working_set_t *data_set)
 362 {
 363     char *key = NULL;
 364     pe_action_t *clear = NULL;
 365 
 366     CRM_CHECK(rsc && node && reason && data_set, return NULL);
 367 
         // key is not freed in this function; presumably custom_action() takes
         // ownership of it -- confirm
 368     key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
 369     clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
 370                           data_set);
         // Store XML_ATTR_TE_NOWAIT = XML_BOOLEAN_TRUE in the action's meta table.
         // NOTE(review): clear is dereferenced without a NULL check -- confirm
         // custom_action() cannot return NULL here
 371     add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
 372     crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
 373                rsc->id, pe__node_name(node), reason, clear->uuid);
 374     return clear;
 375 }

/* [previous][next][first][last][top][bottom][index][help] */