root/daemons/fenced/fenced_remote.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sort_strings
  2. free_remote_query
  3. free_stonith_remote_op_list
  4. count_peer_device
  5. count_peer_devices
  6. find_peer_device
  7. grab_peer_device
  8. clear_remote_op_timers
  9. free_remote_op
  10. init_stonith_remote_op_hash_table
  11. op_requested_action
  12. op_phase_off
  13. op_phase_on
  14. undo_op_remap
  15. fencing_result2xml
  16. fenced_broadcast_op_result
  17. handle_local_reply_and_notify
  18. finalize_op_duplicates
  19. delegate_from_xml
  20. finalize_op
  21. remote_op_watchdog_done
  22. remote_op_timeout_one
  23. finalize_timed_out_op
  24. remote_op_timeout
  25. remote_op_query_timeout
  26. topology_is_empty
  27. add_required_device
  28. remove_required_device
  29. set_op_device_list
  30. topology_matches
  31. find_topology_for_host
  32. advance_topology_level
  33. merge_duplicates
  34. fencing_active_peers
  35. fenced_handle_manual_confirmation
  36. create_remote_stonith_op
  37. initiate_remote_stonith_op
  38. find_best_peer
  39. stonith_choose_peer
  40. get_device_timeout
  41. add_device_timeout
  42. get_peer_timeout
  43. get_op_total_timeout
  44. report_timeout_period
  45. advance_topology_device_in_level
  46. check_watchdog_fencing_and_wait
  47. request_peer_fencing
  48. sort_peers
  49. all_topology_devices_found
  50. parse_action_specific
  51. add_device_properties
  52. add_result
  53. process_remote_stonith_query
  54. fenced_process_fencing_reply
  55. stonith_check_fence_tolerance

   1 /*
   2  * Copyright 2009-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <stdio.h>
  14 #include <sys/types.h>
  15 #include <sys/wait.h>
  16 #include <sys/stat.h>
  17 #include <unistd.h>
  18 #include <sys/utsname.h>
  19 
  20 #include <stdlib.h>
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <ctype.h>
  24 #include <regex.h>
  25 
  26 #include <crm/crm.h>
  27 #include <crm/msg_xml.h>
  28 #include <crm/common/ipc.h>
  29 #include <crm/common/ipc_internal.h>
  30 #include <crm/cluster/internal.h>
  31 
  32 #include <crm/stonith-ng.h>
  33 #include <crm/fencing/internal.h>
  34 #include <crm/common/xml.h>
  35 #include <crm/common/xml_internal.h>
  36 
  37 #include <crm/common/util.h>
  38 #include <pacemaker-fenced.h>
  39 
  40 #define TIMEOUT_MULTIPLY_FACTOR 1.2
  41 
  42 /* When one fencer queries its peers for devices able to handle a fencing
  43  * request, each peer will reply with a list of such devices available to it.
  44  * Each reply will be parsed into a peer_device_info_t, with each device's
  45  * information kept in a device_properties_t.
  46  */
  47 
  48 typedef struct device_properties_s {
  49     /* Whether access to this device has been verified */
  50     gboolean verified;
  51 
  52     /* The remaining members are indexed by the operation's "phase" */
  53 
  54     /* Whether this device has been executed in each phase */
  55     gboolean executed[st_phase_max];
  56     /* Whether this device is disallowed from executing in each phase */
  57     gboolean disallowed[st_phase_max];
  58     /* Action-specific timeout for each phase */
  59     int custom_action_timeout[st_phase_max];
  60     /* Action-specific maximum random delay for each phase */
  61     int delay_max[st_phase_max];
  62     /* Action-specific base delay for each phase */
  63     int delay_base[st_phase_max];
  64     /* Group of enum st_device_flags */
  65     uint32_t device_support_flags;
  66 } device_properties_t;
  67 
  68 typedef struct {
  69     /* Name of peer that sent this result */
  70     char *host;
  71     /* Only try peers for non-topology based operations once */
  72     gboolean tried;
  73     /* Number of entries in the devices table */
  74     int ndevices;
  75     /* Devices available to this host that are capable of fencing the target */
  76     GHashTable *devices;
  77 } peer_device_info_t;
  78 
  79 GHashTable *stonith_remote_op_list = NULL;
  80 
  81 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
  82                                   int call_options);
  83 
  84 static void request_peer_fencing(remote_fencing_op_t *op,
  85                                  peer_device_info_t *peer);
  86 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
  87 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
  88 static int get_op_total_timeout(const remote_fencing_op_t *op,
  89                                 const peer_device_info_t *chosen_peer);
  90 
  91 static gint
  92 sort_strings(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94     return strcmp(a, b);
  95 }
  96 
  97 static void
  98 free_remote_query(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
  99 {
 100     if (data != NULL) {
 101         peer_device_info_t *peer = data;
 102 
 103         g_hash_table_destroy(peer->devices);
 104         free(peer->host);
 105         free(peer);
 106     }
 107 }
 108 
 109 void
 110 free_stonith_remote_op_list(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 111 {
 112     if (stonith_remote_op_list != NULL) {
 113         g_hash_table_destroy(stonith_remote_op_list);
 114         stonith_remote_op_list = NULL;
 115     }
 116 }
 117 
 118 struct peer_count_data {
 119     const remote_fencing_op_t *op;
 120     gboolean verified_only;
 121     uint32_t support_action_only;
 122     int count;
 123 };
 124 
 125 /*!
 126  * \internal
 127  * \brief Increment a counter if a device has not been executed yet
 128  *
 129  * \param[in]     key        Device ID (ignored)
 130  * \param[in]     value      Device properties
 131  * \param[in,out] user_data  Peer count data
 132  */
 133 static void
 134 count_peer_device(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 135 {
 136     device_properties_t *props = (device_properties_t*)value;
 137     struct peer_count_data *data = user_data;
 138 
 139     if (!props->executed[data->op->phase]
 140         && (!data->verified_only || props->verified)
 141         && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
 142         ++(data->count);
 143     }
 144 }
 145 
 146 /*!
 147  * \internal
 148  * \brief Check the number of available devices in a peer's query results
 149  *
 150  * \param[in] op             Operation that results are for
 151  * \param[in] peer           Peer to count
 152  * \param[in] verified_only  Whether to count only verified devices
 153  * \param[in] support_action_only Whether to count only devices that support action
 154  *
 155  * \return Number of devices available to peer that were not already executed
 156  */
 157 static int
 158 count_peer_devices(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
 159                    const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
 160 {
 161     struct peer_count_data data;
 162 
 163     data.op = op;
 164     data.verified_only = verified_only;
 165     data.support_action_only = support_on_action_only;
 166     data.count = 0;
 167     if (peer) {
 168         g_hash_table_foreach(peer->devices, count_peer_device, &data);
 169     }
 170     return data.count;
 171 }
 172 
 173 /*!
 174  * \internal
 175  * \brief Search for a device in a query result
 176  *
 177  * \param[in] op      Operation that result is for
 178  * \param[in] peer    Query result for a peer
 179  * \param[in] device  Device ID to search for
 180  *
 181  * \return Device properties if found, NULL otherwise
 182  */
 183 static device_properties_t *
 184 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 185                  const char *device, uint32_t support_action_only)
 186 {
 187     device_properties_t *props = g_hash_table_lookup(peer->devices, device);
 188 
 189     if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
 190         return NULL;
 191     }
 192     return (props && !props->executed[op->phase]
 193            && !props->disallowed[op->phase])? props : NULL;
 194 }
 195 
 196 /*!
 197  * \internal
 198  * \brief Find a device in a peer's device list and mark it as executed
 199  *
 200  * \param[in]     op                     Operation that peer result is for
 201  * \param[in,out] peer                   Peer with results to search
 202  * \param[in]     device                 ID of device to mark as done
 203  * \param[in]     verified_devices_only  Only consider verified devices
 204  *
 205  * \return TRUE if device was found and marked, FALSE otherwise
 206  */
 207 static gboolean
 208 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
     /* [previous][next][first][last][top][bottom][index][help] */
 209                  const char *device, gboolean verified_devices_only)
 210 {
 211     device_properties_t *props = find_peer_device(op, peer, device,
 212                                                   fenced_support_flag(op->action));
 213 
 214     if ((props == NULL) || (verified_devices_only && !props->verified)) {
 215         return FALSE;
 216     }
 217 
 218     crm_trace("Removing %s from %s (%d remaining)",
 219               device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
 220     props->executed[op->phase] = TRUE;
 221     return TRUE;
 222 }
 223 
 224 static void
 225 clear_remote_op_timers(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 226 {
 227     if (op->query_timer) {
 228         g_source_remove(op->query_timer);
 229         op->query_timer = 0;
 230     }
 231     if (op->op_timer_total) {
 232         g_source_remove(op->op_timer_total);
 233         op->op_timer_total = 0;
 234     }
 235     if (op->op_timer_one) {
 236         g_source_remove(op->op_timer_one);
 237         op->op_timer_one = 0;
 238     }
 239 }
 240 
 241 static void
 242 free_remote_op(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 243 {
 244     remote_fencing_op_t *op = data;
 245 
 246     crm_log_xml_debug(op->request, "Destroying");
 247 
 248     clear_remote_op_timers(op);
 249 
 250     free(op->id);
 251     free(op->action);
 252     free(op->delegate);
 253     free(op->target);
 254     free(op->client_id);
 255     free(op->client_name);
 256     free(op->originator);
 257 
 258     if (op->query_results) {
 259         g_list_free_full(op->query_results, free_remote_query);
 260     }
 261     if (op->request) {
 262         free_xml(op->request);
 263         op->request = NULL;
 264     }
 265     if (op->devices_list) {
 266         g_list_free_full(op->devices_list, free);
 267         op->devices_list = NULL;
 268     }
 269     g_list_free_full(op->automatic_list, free);
 270     g_list_free(op->duplicates);
 271 
 272     pcmk__reset_result(&op->result);
 273     free(op);
 274 }
 275 
 276 void
 277 init_stonith_remote_op_hash_table(GHashTable **table)
     /* [previous][next][first][last][top][bottom][index][help] */
 278 {
 279     if (*table == NULL) {
 280         *table = pcmk__strkey_table(NULL, free_remote_op);
 281     }
 282 }
 283 
 284 /*!
 285  * \internal
 286  * \brief Return an operation's originally requested action (before any remap)
 287  *
 288  * \param[in] op  Operation to check
 289  *
 290  * \return Operation's original action
 291  */
 292 static const char *
 293 op_requested_action(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 294 {
 295     return ((op->phase > st_phase_requested)? "reboot" : op->action);
 296 }
 297 
 298 /*!
 299  * \internal
 300  * \brief Remap a "reboot" operation to the "off" phase
 301  *
 302  * \param[in,out] op      Operation to remap
 303  */
 304 static void
 305 op_phase_off(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 306 {
 307     crm_info("Remapping multiple-device reboot targeting %s to 'off' "
 308              CRM_XS " id=%.8s", op->target, op->id);
 309     op->phase = st_phase_off;
 310 
 311     /* Happily, "off" and "on" are shorter than "reboot", so we can reuse the
 312      * memory allocation at each phase.
 313      */
 314     strcpy(op->action, "off");
 315 }
 316 
 317 /*!
 318  * \internal
 319  * \brief Advance a remapped reboot operation to the "on" phase
 320  *
 321  * \param[in,out] op  Operation to remap
 322  */
 323 static void
 324 op_phase_on(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 325 {
 326     GList *iter = NULL;
 327 
 328     crm_info("Remapped 'off' targeting %s complete, "
 329              "remapping to 'on' for %s " CRM_XS " id=%.8s",
 330              op->target, op->client_name, op->id);
 331     op->phase = st_phase_on;
 332     strcpy(op->action, "on");
 333 
 334     /* Skip devices with automatic unfencing, because the cluster will handle it
 335      * when the node rejoins.
 336      */
 337     for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
 338         GList *match = g_list_find_custom(op->devices_list, iter->data,
 339                                             sort_strings);
 340 
 341         if (match) {
 342             op->devices_list = g_list_remove(op->devices_list, match->data);
 343         }
 344     }
 345     g_list_free_full(op->automatic_list, free);
 346     op->automatic_list = NULL;
 347 
 348     /* Rewind device list pointer */
 349     op->devices = op->devices_list;
 350 }
 351 
 352 /*!
 353  * \internal
 354  * \brief Reset a remapped reboot operation
 355  *
 356  * \param[in,out] op  Operation to reset
 357  */
 358 static void
 359 undo_op_remap(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 360 {
 361     if (op->phase > 0) {
 362         crm_info("Undoing remap of reboot targeting %s for %s "
 363                  CRM_XS " id=%.8s", op->target, op->client_name, op->id);
 364         op->phase = st_phase_requested;
 365         strcpy(op->action, "reboot");
 366     }
 367 }
 368 
 369 /*!
 370  * \internal
 371  * \brief Create notification data XML for a fencing operation result
 372  *
 373  * \param[in] op      Fencer operation that completed
 374  *
 375  * \return Newly created XML to add as notification data
 376  * \note The caller is responsible for freeing the result.
 377  */
 378 static xmlNode *
 379 fencing_result2xml(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 380 {
 381     xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
 382 
 383     crm_xml_add_int(notify_data, "state", op->state);
 384     crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
 385     crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
 386     crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
 387     crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
 388     crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
 389     crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
 390     crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
 391 
 392     return notify_data;
 393 }
 394 
 395 /*!
 396  * \internal
 397  * \brief Broadcast a fence result notification to all CPG peers
 398  *
 399  * \param[in] op         Fencer operation that completed
 400  * \param[in] op_merged  Whether this operation is a duplicate of another
 401  */
 402 void
 403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
     /* [previous][next][first][last][top][bottom][index][help] */
 404 {
 405     static int count = 0;
 406     xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
 407     xmlNode *notify_data = fencing_result2xml(op);
 408 
 409     count++;
 410     crm_trace("Broadcasting result to peers");
 411     crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
 412     crm_xml_add(bcast, F_SUBTYPE, "broadcast");
 413     crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
 414     crm_xml_add_int(bcast, "count", count);
 415 
 416     if (op_merged) {
 417         pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
 418     }
 419 
 420     stonith__xe_set_result(notify_data, &op->result);
 421 
 422     add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
 423     send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
 424     free_xml(notify_data);
 425     free_xml(bcast);
 426 
 427     return;
 428 }
 429 
 430 /*!
 431  * \internal
 432  * \brief Reply to a local request originator and notify all subscribed clients
 433  *
 434  * \param[in,out] op    Fencer operation that completed
 435  * \param[in,out] data  Top-level XML to add notification to
 436  */
 437 static void
 438 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 439 {
 440     xmlNode *notify_data = NULL;
 441     xmlNode *reply = NULL;
 442     pcmk__client_t *client = NULL;
 443 
 444     if (op->notify_sent == TRUE) {
 445         /* nothing to do */
 446         return;
 447     }
 448 
 449     /* Do notification with a clean data object */
 450     crm_xml_add_int(data, "state", op->state);
 451     crm_xml_add(data, F_STONITH_TARGET, op->target);
 452     crm_xml_add(data, F_STONITH_OPERATION, op->action);
 453 
 454     reply = fenced_construct_reply(op->request, data, &op->result);
 455     crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
 456 
 457     /* Send fencing OP reply to local client that initiated fencing */
 458     client = pcmk__find_client_by_id(op->client_id);
 459     if (client == NULL) {
 460         crm_trace("Skipping reply to %s: no longer a client", op->client_id);
 461     } else {
 462         do_local_reply(reply, client, op->call_options);
 463     }
 464 
 465     /* bcast to all local clients that the fencing operation happend */
 466     notify_data = fencing_result2xml(op);
 467     fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
 468     free_xml(notify_data);
 469     fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
 470 
 471     /* mark this op as having notify's already sent */
 472     op->notify_sent = TRUE;
 473     free_xml(reply);
 474 }
 475 
 476 /*!
 477  * \internal
 478  * \brief Finalize all duplicates of a given fencer operation
 479  *
 480  * \param[in,out] op    Fencer operation that completed
 481  * \param[in,out] data  Top-level XML to add notification to
 482  */
 483 static void
 484 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 485 {
 486     for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
 487         remote_fencing_op_t *other = iter->data;
 488 
 489         if (other->state == st_duplicate) {
 490             other->state = op->state;
 491             crm_debug("Performing duplicate notification for %s@%s: %s "
 492                       CRM_XS " id=%.8s",
 493                       other->client_name, other->originator,
 494                       pcmk_exec_status_str(op->result.execution_status),
 495                       other->id);
 496             pcmk__copy_result(&op->result, &other->result);
 497             finalize_op(other, data, true);
 498 
 499         } else {
 500             // Possible if (for example) it timed out already
 501             crm_err("Skipping duplicate notification for %s@%s "
 502                     CRM_XS " state=%s id=%.8s",
 503                     other->client_name, other->originator,
 504                     stonith_op_state_str(other->state), other->id);
 505         }
 506     }
 507 }
 508 
 509 static char *
 510 delegate_from_xml(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
 511 {
 512     xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
 513 
 514     if (match == NULL) {
 515         return crm_element_value_copy(xml, F_ORIG);
 516     } else {
 517         return crm_element_value_copy(match, F_STONITH_DELEGATE);
 518     }
 519 }
 520 
 521 /*!
 522  * \internal
 523  * \brief Finalize a peer fencing operation
 524  *
 525  * Clean up after a fencing operation completes. This function has two code
 526  * paths: the executioner uses it to broadcast the result to CPG peers, and then
 527  * each peer (including the executioner) uses it to process that broadcast and
 528  * notify its IPC clients of the result.
 529  *
 530  * \param[in,out] op      Fencer operation that completed
 531  * \param[in,out] data    If not NULL, XML reply of last delegated operation
 532  * \param[in]     dup     Whether this operation is a duplicate of another
 533  *                        (in which case, do not broadcast the result)
 534  *
 535  *  \note The operation result should be set before calling this function.
 536  */
 537 static void
 538 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
     /* [previous][next][first][last][top][bottom][index][help] */
 539 {
 540     int level = LOG_ERR;
 541     const char *subt = NULL;
 542     xmlNode *local_data = NULL;
 543     gboolean op_merged = FALSE;
 544 
 545     CRM_CHECK((op != NULL), return);
 546 
 547     // This is a no-op if timers have already been cleared
 548     clear_remote_op_timers(op);
 549 
 550     if (op->notify_sent) {
 551         // Most likely, this is a timed-out action that eventually completed
 552         crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
 553                    "Result arrived too late " CRM_XS " id=%.8s",
 554                    op->action, (op->target? " targeting " : ""),
 555                    (op->target? op->target : ""),
 556                    (op->delegate? op->delegate : "unknown node"),
 557                    op->client_name, op->originator,
 558                    (op_merged? " (merged)" : ""),
 559                    op->id);
 560         return;
 561     }
 562 
 563     set_fencing_completed(op);
 564     undo_op_remap(op);
 565 
 566     if (data == NULL) {
 567         data = create_xml_node(NULL, "remote-op");
 568         local_data = data;
 569 
 570     } else if (op->delegate == NULL) {
 571         switch (op->result.execution_status) {
 572             case PCMK_EXEC_NO_FENCE_DEVICE:
 573                 break;
 574 
 575             case PCMK_EXEC_INVALID:
 576                 if (op->result.exit_status != CRM_EX_EXPIRED) {
 577                     op->delegate = delegate_from_xml(data);
 578                 }
 579                 break;
 580 
 581             default:
 582                 op->delegate = delegate_from_xml(data);
 583                 break;
 584         }
 585     }
 586 
 587     if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
 588         op_merged = true;
 589     }
 590 
 591     /* Tell everyone the operation is done, we will continue
 592      * with doing the local notifications once we receive
 593      * the broadcast back. */
 594     subt = crm_element_value(data, F_SUBTYPE);
 595     if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
 596         /* Defer notification until the bcast message arrives */
 597         fenced_broadcast_op_result(op, op_merged);
 598         free_xml(local_data);
 599         return;
 600     }
 601 
 602     if (pcmk__result_ok(&op->result) || dup
 603         || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
 604         level = LOG_NOTICE;
 605     }
 606     do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
 607                CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
 608                (op->target? op->target : ""),
 609                (op->delegate? op->delegate : "unknown node"),
 610                op->client_name, op->originator,
 611                (op_merged? " (merged)" : ""),
 612                crm_exit_str(op->result.exit_status),
 613                pcmk_exec_status_str(op->result.execution_status),
 614                ((op->result.exit_reason == NULL)? "" : ": "),
 615                ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
 616                op->id);
 617 
 618     handle_local_reply_and_notify(op, data);
 619 
 620     if (!dup) {
 621         finalize_op_duplicates(op, data);
 622     }
 623 
 624     /* Free non-essential parts of the record
 625      * Keep the record around so we can query the history
 626      */
 627     if (op->query_results) {
 628         g_list_free_full(op->query_results, free_remote_query);
 629         op->query_results = NULL;
 630     }
 631     if (op->request) {
 632         free_xml(op->request);
 633         op->request = NULL;
 634     }
 635 
 636     free_xml(local_data);
 637 }
 638 
 639 /*!
 640  * \internal
 641  * \brief Finalize a watchdog fencer op after the waiting time expires
 642  *
 643  * \param[in,out] userdata  Fencer operation that completed
 644  *
 645  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 646  */
 647 static gboolean
 648 remote_op_watchdog_done(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 649 {
 650     remote_fencing_op_t *op = userdata;
 651 
 652     op->op_timer_one = 0;
 653 
 654     crm_notice("Self-fencing (%s) by %s for %s assumed complete "
 655                CRM_XS " id=%.8s",
 656                op->action, op->target, op->client_name, op->id);
 657     op->state = st_done;
 658     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 659     finalize_op(op, NULL, false);
 660     return G_SOURCE_REMOVE;
 661 }
 662 
 663 static gboolean
 664 remote_op_timeout_one(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 665 {
 666     remote_fencing_op_t *op = userdata;
 667 
 668     op->op_timer_one = 0;
 669 
 670     crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
 671                " id=%.8s", op->action, op->target, op->client_name, op->id);
 672     pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
 673                      "Peer did not return fence result within timeout");
 674 
 675     // The requested delay has been applied for the first device
 676     if (op->delay > 0) {
 677         op->delay = 0;
 678         crm_trace("Try another device for '%s' action targeting %s "
 679                   "for client %s without delay " CRM_XS " id=%.8s",
 680                   op->action, op->target, op->client_name, op->id);
 681     }
 682 
 683     // Try another device, if appropriate
 684     request_peer_fencing(op, NULL);
 685     return G_SOURCE_REMOVE;
 686 }
 687 
 688 /*!
 689  * \internal
 690  * \brief Finalize a remote fencer operation that timed out
 691  *
 692  * \param[in,out] op      Fencer operation that timed out
 693  * \param[in]     reason  Readable description of what step timed out
 694  */
 695 static void
 696 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
     /* [previous][next][first][last][top][bottom][index][help] */
 697 {
 698     crm_debug("Action '%s' targeting %s for client %s timed out "
 699               CRM_XS " id=%.8s",
 700               op->action, op->target, op->client_name, op->id);
 701 
 702     if (op->phase == st_phase_on) {
 703         /* A remapped reboot operation timed out in the "on" phase, but the
 704          * "off" phase completed successfully, so quit trying any further
 705          * devices, and return success.
 706          */
 707         op->state = st_done;
 708         pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
 709     } else {
 710         op->state = st_failed;
 711         pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
 712     }
 713     finalize_op(op, NULL, false);
 714 }
 715 
 716 /*!
 717  * \internal
 718  * \brief Finalize a remote fencer operation that timed out
 719  *
 720  * \param[in,out] userdata  Fencer operation that timed out
 721  *
 722  * \return G_SOURCE_REMOVE (which tells glib not to restart timer)
 723  */
 724 static gboolean
 725 remote_op_timeout(gpointer userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 726 {
 727     remote_fencing_op_t *op = userdata;
 728 
 729     op->op_timer_total = 0;
 730 
 731     if (op->state == st_done) {
 732         crm_debug("Action '%s' targeting %s for client %s already completed "
 733                   CRM_XS " id=%.8s",
 734                   op->action, op->target, op->client_name, op->id);
 735     } else {
 736         finalize_timed_out_op(userdata, "Fencing did not complete within a "
 737                                         "total timeout based on the "
 738                                         "configured timeout and retries for "
 739                                         "any devices attempted");
 740     }
 741     return G_SOURCE_REMOVE;
 742 }
 743 
 744 static gboolean
 745 remote_op_query_timeout(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 746 {
 747     remote_fencing_op_t *op = data;
 748 
 749     op->query_timer = 0;
 750 
 751     if (op->state == st_done) {
 752         crm_debug("Operation %.8s targeting %s already completed",
 753                   op->id, op->target);
 754     } else if (op->state == st_exec) {
 755         crm_debug("Operation %.8s targeting %s already in progress",
 756                   op->id, op->target);
 757     } else if (op->query_results) {
 758         // Query succeeded, so attempt the actual fencing
 759         crm_debug("Query %.8s targeting %s complete (state=%s)",
 760                   op->id, op->target, stonith_op_state_str(op->state));
 761         request_peer_fencing(op, NULL);
 762     } else {
 763         crm_debug("Query %.8s targeting %s timed out (state=%s)",
 764                   op->id, op->target, stonith_op_state_str(op->state));
 765         finalize_timed_out_op(op, "No capable peers replied to device query "
 766                                   "within timeout");
 767     }
 768 
 769     return G_SOURCE_REMOVE;
 770 }
 771 
 772 static gboolean
 773 topology_is_empty(stonith_topology_t *tp)
     /* [previous][next][first][last][top][bottom][index][help] */
 774 {
 775     int i;
 776 
 777     if (tp == NULL) {
 778         return TRUE;
 779     }
 780 
 781     for (i = 0; i < ST_LEVEL_MAX; i++) {
 782         if (tp->levels[i] != NULL) {
 783             return FALSE;
 784         }
 785     }
 786     return TRUE;
 787 }
 788 
 789 /*!
 790  * \internal
 791  * \brief Add a device to an operation's automatic unfencing list
 792  *
 793  * \param[in,out] op      Operation to modify
 794  * \param[in]     device  Device ID to add
 795  */
 796 static void
 797 add_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 798 {
 799     GList *match  = g_list_find_custom(op->automatic_list, device,
 800                                          sort_strings);
 801 
 802     if (!match) {
 803         op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
 804     }
 805 }
 806 
 807 /*!
 808  * \internal
 809  * \brief Remove a device from the automatic unfencing list
 810  *
 811  * \param[in,out] op      Operation to modify
 812  * \param[in]     device  Device ID to remove
 813  */
 814 static void
 815 remove_required_device(remote_fencing_op_t *op, const char *device)
     /* [previous][next][first][last][top][bottom][index][help] */
 816 {
 817     GList *match = g_list_find_custom(op->automatic_list, device,
 818                                         sort_strings);
 819 
 820     if (match) {
 821         op->automatic_list = g_list_remove(op->automatic_list, match->data);
 822     }
 823 }
 824 
 825 /* deep copy the device list */
 826 static void
 827 set_op_device_list(remote_fencing_op_t * op, GList *devices)
     /* [previous][next][first][last][top][bottom][index][help] */
 828 {
 829     GList *lpc = NULL;
 830 
 831     if (op->devices_list) {
 832         g_list_free_full(op->devices_list, free);
 833         op->devices_list = NULL;
 834     }
 835     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
 836         op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
 837     }
 838     op->devices = op->devices_list;
 839 }
 840 
 841 /*!
 842  * \internal
 843  * \brief Check whether a node matches a topology target
 844  *
 845  * \param[in] tp    Topology table entry to check
 846  * \param[in] node  Name of node to check
 847  *
 848  * \return TRUE if node matches topology target
 849  */
 850 static gboolean
 851 topology_matches(const stonith_topology_t *tp, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 852 {
 853     regex_t r_patt;
 854 
 855     CRM_CHECK(node && tp && tp->target, return FALSE);
 856     switch (tp->kind) {
 857         case fenced_target_by_attribute:
 858             /* This level targets by attribute, so tp->target is a NAME=VALUE pair
 859              * of a permanent attribute applied to targeted nodes. The test below
 860              * relies on the locally cached copy of the CIB, so if fencing needs to
 861              * be done before the initial CIB is received or after a malformed CIB
 862              * is received, then the topology will be unable to be used.
 863              */
 864             if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
 865                 crm_notice("Matched %s with %s by attribute", node, tp->target);
 866                 return TRUE;
 867             }
 868             break;
 869 
 870         case fenced_target_by_pattern:
 871             /* This level targets node names matching a pattern, so tp->target
 872              * (and tp->target_pattern) is a regular expression.
 873              */
 874             if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
 875                 crm_info("Bad regex '%s' for fencing level", tp->target);
 876             } else {
 877                 int status = regexec(&r_patt, node, 0, NULL, 0);
 878 
 879                 regfree(&r_patt);
 880                 if (status == 0) {
 881                     crm_notice("Matched %s with %s by name", node, tp->target);
 882                     return TRUE;
 883                 }
 884             }
 885             break;
 886 
 887         case fenced_target_by_name:
 888             crm_trace("Testing %s against %s", node, tp->target);
 889             return pcmk__str_eq(tp->target, node, pcmk__str_casei);
 890 
 891         default:
 892             break;
 893     }
 894     crm_trace("No match for %s with %s", node, tp->target);
 895     return FALSE;
 896 }
 897 
 898 stonith_topology_t *
 899 find_topology_for_host(const char *host) 
     /* [previous][next][first][last][top][bottom][index][help] */
 900 {
 901     GHashTableIter tIter;
 902     stonith_topology_t *tp = g_hash_table_lookup(topology, host);
 903 
 904     if(tp != NULL) {
 905         crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 906         return tp;
 907     }
 908 
 909     g_hash_table_iter_init(&tIter, topology);
 910     while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
 911         if (topology_matches(tp, host)) {
 912             crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
 913             return tp;
 914         }
 915     }
 916 
 917     crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
 918     return NULL;
 919 }
 920 
 921 /*!
 922  * \internal
 923  * \brief Set fencing operation's device list to target's next topology level
 924  *
 925  * \param[in,out] op        Remote fencing operation to modify
 926  * \param[in]     empty_ok  If true, an operation without a target (i.e.
 927  *                          queries) or a target without a topology will get a
 928  *                          pcmk_rc_ok return value instead of ENODEV
 929  *
 930  * \return Standard Pacemaker return value
 931  */
 932 static int
 933 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
     /* [previous][next][first][last][top][bottom][index][help] */
 934 {
 935     stonith_topology_t *tp = NULL;
 936 
 937     if (op->target) {
 938         tp = find_topology_for_host(op->target);
 939     }
 940     if (topology_is_empty(tp)) {
 941         return empty_ok? pcmk_rc_ok : ENODEV;
 942     }
 943 
 944     CRM_ASSERT(tp->levels != NULL);
 945 
 946     stonith__set_call_options(op->call_options, op->id, st_opt_topology);
 947 
 948     /* This is a new level, so undo any remapping left over from previous */
 949     undo_op_remap(op);
 950 
 951     do {
 952         op->level++;
 953 
 954     } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
 955 
 956     if (op->level < ST_LEVEL_MAX) {
 957         crm_trace("Attempting fencing level %d targeting %s (%d devices) "
 958                   "for client %s@%s (id=%.8s)",
 959                   op->level, op->target, g_list_length(tp->levels[op->level]),
 960                   op->client_name, op->originator, op->id);
 961         set_op_device_list(op, tp->levels[op->level]);
 962 
 963         // The requested delay has been applied for the first fencing level
 964         if (op->level > 1 && op->delay > 0) {
 965             op->delay = 0;
 966         }
 967 
 968         if ((g_list_next(op->devices_list) != NULL)
 969             && pcmk__str_eq(op->action, "reboot", pcmk__str_none)) {
 970             /* A reboot has been requested for a topology level with multiple
 971              * devices. Instead of rebooting the devices sequentially, we will
 972              * turn them all off, then turn them all on again. (Think about
 973              * switched power outlets for redundant power supplies.)
 974              */
 975             op_phase_off(op);
 976         }
 977         return pcmk_rc_ok;
 978     }
 979 
 980     crm_info("All %sfencing options targeting %s for client %s@%s failed "
 981              CRM_XS " id=%.8s",
 982              (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
 983              op->target, op->client_name, op->originator, op->id);
 984     return ENODEV;
 985 }
 986 
 987 /*!
 988  * \internal
 989  * \brief If fencing operation is a duplicate, merge it into the other one
 990  *
 991  * \param[in,out] op  Fencing operation to check
 992  */
 993 static void
 994 merge_duplicates(remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
 995 {
 996     GHashTableIter iter;
 997     remote_fencing_op_t *other = NULL;
 998 
 999     time_t now = time(NULL);
1000 
1001     g_hash_table_iter_init(&iter, stonith_remote_op_list);
1002     while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
1003         const char *other_action = op_requested_action(other);
1004 
1005         if (!strcmp(op->id, other->id)) {
1006             continue; // Don't compare against self
1007         }
1008         if (other->state > st_exec) {
1009             crm_trace("%.8s not duplicate of %.8s: not in progress",
1010                       op->id, other->id);
1011             continue;
1012         }
1013         if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1014             crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1015                       op->id, other->id, op->target, other->target);
1016             continue;
1017         }
1018         if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1019             crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1020                       op->id, other->id, op->action, other_action);
1021             continue;
1022         }
1023         if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1024             crm_trace("%.8s not duplicate of %.8s: same client %s",
1025                       op->id, other->id, op->client_name);
1026             continue;
1027         }
1028         if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1029             crm_trace("%.8s not duplicate of %.8s: suicide for %s",
1030                       op->id, other->id, other->target);
1031             continue;
1032         }
1033         if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
1034             crm_notice("Failing action '%s' targeting %s originating from "
1035                        "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1036                        other->action, other->target, other->client_name,
1037                        other->originator, other->id);
1038             crm_trace("%.8s not duplicate of %.8s: originator dead",
1039                       op->id, other->id);
1040             other->state = st_failed;
1041             continue;
1042         }
1043         if ((other->total_timeout > 0)
1044             && (now > (other->total_timeout + other->created))) {
1045             crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
1046                       op->id, other->id, now, other->created,
1047                       other->total_timeout);
1048             continue;
1049         }
1050 
1051         /* There is another in-flight request to fence the same host
1052          * Piggyback on that instead.  If it fails, so do we.
1053          */
1054         other->duplicates = g_list_append(other->duplicates, op);
1055         if (other->total_timeout == 0) {
1056             other->total_timeout = op->total_timeout =
1057                 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1058             crm_trace("Best guess as to timeout used for %.8s: %d",
1059                       other->id, other->total_timeout);
1060         }
1061         crm_notice("Merging fencing action '%s' targeting %s originating from "
1062                    "client %s with identical request from %s@%s "
1063                    CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1064                    op->action, op->target, op->client_name,
1065                    other->client_name, other->originator,
1066                    op->id, other->id, other->total_timeout);
1067         report_timeout_period(op, other->total_timeout);
1068         op->state = st_duplicate;
1069     }
1070 }
1071 
1072 static uint32_t fencing_active_peers(void)
     /* [previous][next][first][last][top][bottom][index][help] */
1073 {
1074     uint32_t count = 0;
1075     crm_node_t *entry;
1076     GHashTableIter gIter;
1077 
1078     g_hash_table_iter_init(&gIter, crm_peer_cache);
1079     while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1080         if(fencing_peer_active(entry)) {
1081             count++;
1082         }
1083     }
1084     return count;
1085 }
1086 
1087 /*!
1088  * \internal
1089  * \brief Process a manual confirmation of a pending fence action
1090  *
1091  * \param[in]     client  IPC client that sent confirmation
1092  * \param[in,out] msg     Request XML with manual confirmation
1093  *
1094  * \return Standard Pacemaker return code
1095  */
1096 int
1097 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
1098 {
1099     remote_fencing_op_t *op = NULL;
1100     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
1101 
1102     CRM_CHECK(dev != NULL, return EPROTO);
1103 
1104     crm_notice("Received manual confirmation that %s has been fenced",
1105                pcmk__s(crm_element_value(dev, F_STONITH_TARGET),
1106                        "unknown target"));
1107     op = initiate_remote_stonith_op(client, msg, TRUE);
1108     if (op == NULL) {
1109         return EPROTO;
1110     }
1111     op->state = st_done;
1112     set_fencing_completed(op);
1113     op->delegate = strdup("a human");
1114 
1115     // For the fencer's purposes, the fencing operation is done
1116     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1117     finalize_op(op, msg, false);
1118 
1119     /* For the requester's purposes, the operation is still pending. The
1120      * actual result will be sent asynchronously via the operation's done_cb().
1121      */
1122     return EINPROGRESS;
1123 }
1124 
1125 /*!
1126  * \internal
1127  * \brief Create a new remote stonith operation
1128  *
1129  * \param[in] client   ID of local stonith client that initiated the operation
1130  * \param[in] request  The request from the client that started the operation
1131  * \param[in] peer     TRUE if this operation is owned by another stonith peer
1132  *                     (an operation owned by one peer is stored on all peers,
1133  *                     but only the owner executes it; all nodes get the results
1134  *                     once the owner finishes execution)
1135  */
1136 void *
1137 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1138 {
1139     remote_fencing_op_t *op = NULL;
1140     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
1141     int call_options = 0;
1142     const char *operation = NULL;
1143 
1144     init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1145 
1146     /* If this operation is owned by another node, check to make
1147      * sure we haven't already created this operation. */
1148     if (peer && dev) {
1149         const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1150 
1151         CRM_CHECK(op_id != NULL, return NULL);
1152 
1153         op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1154         if (op) {
1155             crm_debug("Reusing existing remote fencing op %.8s for %s",
1156                       op_id, ((client == NULL)? "unknown client" : client));
1157             return op;
1158         }
1159     }
1160 
1161     op = calloc(1, sizeof(remote_fencing_op_t));
1162     CRM_ASSERT(op != NULL);
1163 
1164     crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
1165     // Value -1 means disable any static/random fencing delays
1166     crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));
1167 
1168     if (peer && dev) {
1169         op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
1170     } else {
1171         op->id = crm_generate_uuid();
1172     }
1173 
1174     g_hash_table_replace(stonith_remote_op_list, op->id, op);
1175 
1176     op->state = st_query;
1177     op->replies_expected = fencing_active_peers();
1178     op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
1179     op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
1180     op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE); /* May not be set */
1181     op->created = time(NULL);
1182 
1183     if (op->originator == NULL) {
1184         /* Local or relayed request */
1185         op->originator = strdup(stonith_our_uname);
1186     }
1187 
1188     CRM_LOG_ASSERT(client != NULL);
1189     if (client) {
1190         op->client_id = strdup(client);
1191     }
1192 
1193 
1194     /* For a RELAY operation, set fenced on the client. */
1195     operation = crm_element_value(request, F_STONITH_OPERATION);
1196 
1197     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1198         op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1199                                          (unsigned long) getpid());
1200     } else {
1201         op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
1202     }
1203 
1204     op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
1205     op->request = copy_xml(request);    /* TODO: Figure out how to avoid this */
1206     crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
1207     op->call_options = call_options;
1208 
1209     crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
1210 
1211     crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1212               "base timeout %d, %u %s expected)",
1213               (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1214               op->target, op->client_name, op->base_timeout,
1215               op->replies_expected,
1216               pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1217 
1218     if (op->call_options & st_opt_cs_nodeid) {
1219         int nodeid;
1220         crm_node_t *node;
1221 
1222         pcmk__scan_min_int(op->target, &nodeid, 0);
1223         node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
1224 
1225         /* Ensure the conversion only happens once */
1226         stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1227 
1228         if (node && node->uname) {
1229             free(op->target);
1230             op->target = strdup(node->uname);
1231 
1232         } else {
1233             crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1234         }
1235     }
1236 
1237     /* check to see if this is a duplicate operation of another in-flight operation */
1238     merge_duplicates(op);
1239 
1240     if (op->state != st_duplicate) {
1241         /* kick history readers */
1242         fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
1243     }
1244 
1245     /* safe to trim as long as that doesn't touch pending ops */
1246     stonith_fence_history_trim();
1247 
1248     return op;
1249 }
1250 
1251 /*!
1252  * \internal
1253  * \brief Create a peer fencing operation from a request, and initiate it
1254  *
1255  * \param[in] client     IPC client that made request (NULL to get from request)
1256  * \param[in] request    Request XML
1257  * \param[in] manual_ack Whether this is a manual action confirmation
1258  *
1259  * \return Newly created operation on success, otherwise NULL
1260  */
1261 remote_fencing_op_t *
1262 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
     /* [previous][next][first][last][top][bottom][index][help] */
1263                            gboolean manual_ack)
1264 {
1265     int query_timeout = 0;
1266     xmlNode *query = NULL;
1267     const char *client_id = NULL;
1268     remote_fencing_op_t *op = NULL;
1269     const char *relay_op_id = NULL;
1270     const char *operation = NULL;
1271 
1272     if (client) {
1273         client_id = client->id;
1274     } else {
1275         client_id = crm_element_value(request, F_STONITH_CLIENTID);
1276     }
1277 
1278     CRM_LOG_ASSERT(client_id != NULL);
1279     op = create_remote_stonith_op(client_id, request, FALSE);
1280     op->owner = TRUE;
1281     if (manual_ack) {
1282         return op;
1283     }
1284 
1285     CRM_CHECK(op->action, return NULL);
1286 
1287     if (advance_topology_level(op, true) != pcmk_rc_ok) {
1288         op->state = st_failed;
1289     }
1290 
1291     switch (op->state) {
1292         case st_failed:
1293             // advance_topology_level() exhausted levels
1294             pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1295                              "All topology levels failed");
1296             crm_warn("Could not request peer fencing (%s) targeting %s "
1297                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1298             finalize_op(op, NULL, false);
1299             return op;
1300 
1301         case st_duplicate:
1302             crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1303                      CRM_XS " id=%.8s", op->action, op->target, op->id);
1304             return op;
1305 
1306         default:
1307             crm_notice("Requesting peer fencing (%s) targeting %s "
1308                        CRM_XS " id=%.8s state=%s base_timeout=%d",
1309                        op->action, op->target, op->id,
1310                        stonith_op_state_str(op->state), op->base_timeout);
1311     }
1312 
1313     query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1314                               NULL, op->call_options);
1315 
1316     crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1317     crm_xml_add(query, F_STONITH_TARGET, op->target);
1318     crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1319     crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1320     crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1321     crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1322     crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1323 
1324     /* In case of RELAY operation, RELAY information is added to the query to delete the original operation of RELAY. */
1325     operation = crm_element_value(request, F_STONITH_OPERATION);
1326     if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1327         relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
1328         if (relay_op_id) {
1329             crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
1330         }
1331     }
1332 
1333     send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1334     free_xml(query);
1335 
1336     query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1337     op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1338 
1339     return op;
1340 }
1341 
/*!
 * \internal
 * \brief Flags controlling which peers find_best_peer() will consider
 */
enum find_best_peer_options {
    /*! Do not consider the target peer's own fencing devices */
    FIND_PEER_SKIP_TARGET   = (1 << 0),
    /*! Consider no peer other than the target itself */
    FIND_PEER_TARGET_ONLY   = (1 << 1),
    /*! Ignore peers and devices that have not been verified */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1350 
1351 static peer_device_info_t *
1352 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
     /* [previous][next][first][last][top][bottom][index][help] */
1353 {
1354     GList *iter = NULL;
1355     gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1356 
1357     if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1358         return NULL;
1359     }
1360 
1361     for (iter = op->query_results; iter != NULL; iter = iter->next) {
1362         peer_device_info_t *peer = iter->data;
1363 
1364         crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1365                   peer->host, op->target, peer->ndevices,
1366                   pcmk__plural_s(peer->ndevices), peer->tried, options);
1367         if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1368             continue;
1369         }
1370         if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1371             continue;
1372         }
1373 
1374         if (pcmk_is_set(op->call_options, st_opt_topology)) {
1375 
1376             if (grab_peer_device(op, peer, device, verified_devices_only)) {
1377                 return peer;
1378             }
1379 
1380         } else if (!peer->tried
1381                    && count_peer_devices(op, peer, verified_devices_only,
1382                                          fenced_support_flag(op->action))) {
1383             /* No topology: Use the current best peer */
1384             crm_trace("Simple fencing");
1385             return peer;
1386         }
1387     }
1388 
1389     return NULL;
1390 }
1391 
1392 static peer_device_info_t *
1393 stonith_choose_peer(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
1394 {
1395     const char *device = NULL;
1396     peer_device_info_t *peer = NULL;
1397     uint32_t active = fencing_active_peers();
1398 
1399     do {
1400         if (op->devices) {
1401             device = op->devices->data;
1402             crm_trace("Checking for someone to fence (%s) %s using %s",
1403                       op->action, op->target, device);
1404         } else {
1405             crm_trace("Checking for someone to fence (%s) %s",
1406                       op->action, op->target);
1407         }
1408 
1409         /* Best choice is a peer other than the target with verified access */
1410         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1411         if (peer) {
1412             crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1413             return peer;
1414         }
1415 
1416         if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1417             crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1418             return NULL;
1419         }
1420 
1421         /* If no other peer has verified access, next best is unverified access */
1422         peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1423         if (peer) {
1424             crm_trace("Found best unverified peer %s", peer->host);
1425             return peer;
1426         }
1427 
1428         /* If no other peer can do it, last option is self-fencing
1429          * (which is never allowed for the "on" phase of a remapped reboot)
1430          */
1431         if (op->phase != st_phase_on) {
1432             peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1433             if (peer) {
1434                 crm_trace("%s will fence itself", peer->host);
1435                 return peer;
1436             }
1437         }
1438 
1439         /* Try the next fencing level if there is one (unless we're in the "on"
1440          * phase of a remapped "reboot", because we ignore errors in that case)
1441          */
1442     } while ((op->phase != st_phase_on)
1443              && pcmk_is_set(op->call_options, st_opt_topology)
1444              && (advance_topology_level(op, false) == pcmk_rc_ok));
1445 
1446     if ((stonith_watchdog_timeout_ms > 0)
1447         && pcmk__is_fencing_action(op->action)
1448         && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
1449         && node_does_watchdog_fencing(op->target)) {
1450         crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1451                  op->target);
1452         /* check_watchdog_fencing_and_wait will log additional info */
1453     } else {
1454         crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1455                    op->action, op->target, (device? device : "any device"));
1456     }
1457     return NULL;
1458 }
1459 
1460 static int
1461 get_device_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1462                    const peer_device_info_t *peer, const char *device,
1463                    bool with_delay)
1464 {
1465     device_properties_t *props;
1466     int delay = 0;
1467 
1468     if (!peer || !device) {
1469         return op->base_timeout;
1470     }
1471 
1472     props = g_hash_table_lookup(peer->devices, device);
1473     if (!props) {
1474         return op->base_timeout;
1475     }
1476 
1477     // op->delay < 0 means disable any static/random fencing delays
1478     if (with_delay && op->delay >= 0) {
1479         // delay_base is eventually limited by delay_max
1480         delay = (props->delay_max[op->phase] > 0 ?
1481                  props->delay_max[op->phase] : props->delay_base[op->phase]);
1482     }
1483 
1484     return (props->custom_action_timeout[op->phase]?
1485             props->custom_action_timeout[op->phase] : op->base_timeout)
1486            + delay;
1487 }
1488 
1489 struct timeout_data {
1490     const remote_fencing_op_t *op;
1491     const peer_device_info_t *peer;
1492     int total_timeout;
1493 };
1494 
1495 /*!
1496  * \internal
1497  * \brief Add timeout to a total if device has not been executed yet
1498  *
1499  * \param[in]     key        GHashTable key (device ID)
1500  * \param[in]     value      GHashTable value (device properties)
1501  * \param[in,out] user_data  Timeout data
1502  */
1503 static void
1504 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1505 {
1506     const char *device_id = key;
1507     device_properties_t *props = value;
1508     struct timeout_data *timeout = user_data;
1509 
1510     if (!props->executed[timeout->op->phase]
1511         && !props->disallowed[timeout->op->phase]) {
1512         timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1513                                                      device_id, true);
1514     }
1515 }
1516 
1517 static int
1518 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1519 {
1520     struct timeout_data timeout;
1521 
1522     timeout.op = op;
1523     timeout.peer = peer;
1524     timeout.total_timeout = 0;
1525 
1526     g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1527 
1528     return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1529 }
1530 
1531 static int
1532 get_op_total_timeout(const remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
1533                      const peer_device_info_t *chosen_peer)
1534 {
1535     int total_timeout = 0;
1536     stonith_topology_t *tp = find_topology_for_host(op->target);
1537 
1538     if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1539         int i;
1540         GList *device_list = NULL;
1541         GList *iter = NULL;
1542         GList *auto_list = NULL;
1543 
1544         if (pcmk__str_eq(op->action, "on", pcmk__str_none)
1545             && (op->automatic_list != NULL)) {
1546             auto_list = g_list_copy(op->automatic_list);
1547         }
1548 
1549         /* Yep, this looks scary, nested loops all over the place.
1550          * Here is what is going on.
1551          * Loop1: Iterate through fencing levels.
1552          * Loop2: If a fencing level has devices, loop through each device
1553          * Loop3: For each device in a fencing level, see what peer owns it
1554          *        and what that peer has reported the timeout is for the device.
1555          */
1556         for (i = 0; i < ST_LEVEL_MAX; i++) {
1557             if (!tp->levels[i]) {
1558                 continue;
1559             }
1560             for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1561                 /* in case of watchdog-device we add the timeout to the budget
1562                    regardless of if we got a reply or not
1563                  */
1564                 if ((stonith_watchdog_timeout_ms > 0)
1565                     && pcmk__is_fencing_action(op->action)
1566                     && pcmk__str_eq(device_list->data, STONITH_WATCHDOG_ID,
1567                                     pcmk__str_none)
1568                     && node_does_watchdog_fencing(op->target)) {
1569                     total_timeout += stonith_watchdog_timeout_ms / 1000;
1570                     continue;
1571                 }
1572 
1573                 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1574                     const peer_device_info_t *peer = iter->data;
1575 
1576                     if (auto_list) {
1577                         GList *match = g_list_find_custom(auto_list, device_list->data,
1578                                         sort_strings);
1579                         if (match) {
1580                             auto_list = g_list_remove(auto_list, match->data);
1581                         }
1582                     }
1583 
1584                     if (find_peer_device(op, peer, device_list->data,
1585                                          fenced_support_flag(op->action))) {
1586                         total_timeout += get_device_timeout(op, peer,
1587                                                             device_list->data,
1588                                                             true);
1589                         break;
1590                     }
1591                 }               /* End Loop3: match device with peer that owns device, find device's timeout period */
1592             }                   /* End Loop2: iterate through devices at a specific level */
1593         }                       /*End Loop1: iterate through fencing levels */
1594 
1595         //Add only exists automatic_list device timeout
1596         if (auto_list) {
1597             for (iter = auto_list; iter != NULL; iter = iter->next) {
1598                 GList *iter2 = NULL;
1599 
1600                 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1601                     peer_device_info_t *peer = iter2->data;
1602                     if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1603                         total_timeout += get_device_timeout(op, peer,
1604                                                             iter->data, true);
1605                         break;
1606                     }
1607                 }
1608             }
1609         }
1610 
1611         g_list_free(auto_list);
1612 
1613     } else if (chosen_peer) {
1614         total_timeout = get_peer_timeout(op, chosen_peer);
1615     } else {
1616         total_timeout = op->base_timeout;
1617     }
1618 
1619     /* Take any requested fencing delay into account to prevent it from eating
1620      * up the total timeout.
1621      */
1622     return ((total_timeout ? total_timeout : op->base_timeout)
1623             + (op->delay > 0 ? op->delay : 0));
1624 }
1625 
1626 static void
1627 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1628 {
1629     GList *iter = NULL;
1630     xmlNode *update = NULL;
1631     const char *client_node = NULL;
1632     const char *client_id = NULL;
1633     const char *call_id = NULL;
1634 
1635     if (op->call_options & st_opt_sync_call) {
1636         /* There is no reason to report the timeout for a synchronous call. It
1637          * is impossible to use the reported timeout to do anything when the client
1638          * is blocking for the response.  This update is only important for
1639          * async calls that require a callback to report the results in. */
1640         return;
1641     } else if (!op->request) {
1642         return;
1643     }
1644 
1645     crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1646     client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1647     call_id = crm_element_value(op->request, F_STONITH_CALLID);
1648     client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1649     if (!client_node || !call_id || !client_id) {
1650         return;
1651     }
1652 
1653     if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1654         // Client is connected to this node, so send update directly to them
1655         do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1656         return;
1657     }
1658 
1659     /* The client is connected to another node, relay this update to them */
1660     update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1661     crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1662     crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1663     crm_xml_add(update, F_STONITH_CALLID, call_id);
1664     crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1665 
1666     send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1667 
1668     free_xml(update);
1669 
1670     for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1671         remote_fencing_op_t *dup = iter->data;
1672 
1673         crm_trace("Reporting timeout for duplicate %.8s to client %s",
1674                   dup->id, dup->client_name);
1675         report_timeout_period(iter->data, op_timeout);
1676     }
1677 }
1678 
1679 /*!
1680  * \internal
1681  * \brief Advance an operation to the next device in its topology
1682  *
1683  * \param[in,out] op      Fencer operation to advance
1684  * \param[in]     device  ID of device that just completed
1685  * \param[in,out] msg     If not NULL, XML reply of last delegated operation
1686  */
1687 static void
1688 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
1689                                  xmlNode *msg)
1690 {
1691     /* Advance to the next device at this topology level, if any */
1692     if (op->devices) {
1693         op->devices = op->devices->next;
1694     }
1695 
1696     /* Handle automatic unfencing if an "on" action was requested */
1697     if ((op->phase == st_phase_requested)
1698         && pcmk__str_eq(op->action, "on", pcmk__str_none)) {
1699         /* If the device we just executed was required, it's not anymore */
1700         remove_required_device(op, device);
1701 
1702         /* If there are no more devices at this topology level, run through any
1703          * remaining devices with automatic unfencing
1704          */
1705         if (op->devices == NULL) {
1706             op->devices = op->automatic_list;
1707         }
1708     }
1709 
1710     if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1711         /* We're done with this level and with required devices, but we had
1712          * remapped "reboot" to "off", so start over with "on". If any devices
1713          * need to be turned back on, op->devices will be non-NULL after this.
1714          */
1715         op_phase_on(op);
1716     }
1717 
1718     // This function is only called if the previous device succeeded
1719     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1720 
1721     if (op->devices) {
1722         /* Necessary devices remain, so execute the next one */
1723         crm_trace("Next targeting %s on behalf of %s@%s",
1724                   op->target, op->client_name, op->originator);
1725 
1726         // The requested delay has been applied for the first device
1727         if (op->delay > 0) {
1728             op->delay = 0;
1729         }
1730 
1731         request_peer_fencing(op, NULL);
1732     } else {
1733         /* We're done with all devices and phases, so finalize operation */
1734         crm_trace("Marking complex fencing op targeting %s as complete",
1735                   op->target);
1736         op->state = st_done;
1737         finalize_op(op, msg, false);
1738     }
1739 }
1740 
1741 static gboolean
1742 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
1743 {
1744     if (node_does_watchdog_fencing(op->target)) {
1745 
1746         crm_notice("Waiting %lds for %s to self-fence (%s) for "
1747                    "client %s " CRM_XS " id=%.8s",
1748                    (stonith_watchdog_timeout_ms / 1000),
1749                    op->target, op->action, op->client_name, op->id);
1750 
1751         if (op->op_timer_one) {
1752             g_source_remove(op->op_timer_one);
1753         }
1754         op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
1755                                          remote_op_watchdog_done, op);
1756         return TRUE;
1757     } else {
1758         crm_debug("Skipping fallback to watchdog-fencing as %s is "
1759                  "not in host-list", op->target);
1760     }
1761     return FALSE;
1762 }
1763 
1764 /*!
1765  * \internal
1766  * \brief Ask a peer to execute a fencing operation
1767  *
1768  * \param[in,out] op      Fencing operation to be executed
1769  * \param[in,out] peer    If NULL or topology is in use, choose best peer to
1770  *                        execute the fencing, otherwise use this peer
1771  */
1772 static void
1773 request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
     /* [previous][next][first][last][top][bottom][index][help] */
1774 {
1775     const char *device = NULL;
1776     int timeout;
1777 
1778     CRM_CHECK(op != NULL, return);
1779 
1780     crm_trace("Action %.8s targeting %s for %s is %s",
1781               op->id, op->target, op->client_name,
1782               stonith_op_state_str(op->state));
1783 
1784     if ((op->phase == st_phase_on) && (op->devices != NULL)) {
1785         /* We are in the "on" phase of a remapped topology reboot. If this
1786          * device has pcmk_reboot_action="off", or doesn't support the "on"
1787          * action, skip it.
1788          *
1789          * We can't check device properties at this point because we haven't
1790          * chosen a peer for this stage yet. Instead, we check the local node's
1791          * knowledge about the device. If different versions of the fence agent
1792          * are installed on different nodes, there's a chance this could be
1793          * mistaken, but the worst that could happen is we don't try turning the
1794          * node back on when we should.
1795          */
1796         device = op->devices->data;
1797         if (pcmk__str_eq(fenced_device_reboot_action(device), "off",
1798                          pcmk__str_none)) {
1799             crm_info("Not turning %s back on using %s because the device is "
1800                      "configured to stay off (pcmk_reboot_action='off')",
1801                      op->target, device);
1802             advance_topology_device_in_level(op, device, NULL);
1803             return;
1804         }
1805         if (!fenced_device_supports_on(device)) {
1806             crm_info("Not turning %s back on using %s because the agent "
1807                      "doesn't support 'on'", op->target, device);
1808             advance_topology_device_in_level(op, device, NULL);
1809             return;
1810         }
1811     }
1812 
1813     timeout = op->base_timeout;
1814     if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1815         peer = stonith_choose_peer(op);
1816     }
1817 
1818     if (!op->op_timer_total) {
1819         op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
1820         op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1821         report_timeout_period(op, op->total_timeout);
1822         crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1823                  CRM_XS "id=%.8s",
1824                  op->total_timeout, op->target, op->client_name, op->id);
1825     }
1826 
1827     if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1828         /* Ignore the caller's peer preference if topology is in use, because
1829          * that peer might not have access to the required device. With
1830          * topology, stonith_choose_peer() removes the device from further
1831          * consideration, so the timeout must be calculated beforehand.
1832          *
1833          * @TODO Basing the total timeout on the caller's preferred peer (above)
1834          *       is less than ideal.
1835          */
1836         peer = stonith_choose_peer(op);
1837 
1838         device = op->devices->data;
1839         /* Fencing timeout sent to peer takes no delay into account.
1840          * The peer will add a dedicated timer for any delay upon
1841          * schedule_stonith_command().
1842          */
1843         timeout = get_device_timeout(op, peer, device, false);
1844     }
1845 
1846     if (peer) {
1847        /* Take any requested fencing delay into account to prevent it from eating
1848         * up the timeout.
1849         */
1850         int timeout_one = (op->delay > 0 ?
1851                            TIMEOUT_MULTIPLY_FACTOR * op->delay : 0);
1852         xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1853 
1854         crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1855         crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1856         crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1857         crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1858         crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1859         crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1860         crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1861         crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1862         crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
1863 
1864         if (device) {
1865             timeout_one += TIMEOUT_MULTIPLY_FACTOR *
1866                            get_device_timeout(op, peer, device, true);
1867             crm_notice("Requesting that %s perform '%s' action targeting %s "
1868                        "using %s " CRM_XS " for client %s (%ds)",
1869                        peer->host, op->action, op->target, device,
1870                        op->client_name, timeout_one);
1871             crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1872 
1873         } else {
1874             timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1875             crm_notice("Requesting that %s perform '%s' action targeting %s "
1876                        CRM_XS " for client %s (%ds, %lds)",
1877                        peer->host, op->action, op->target, op->client_name,
1878                        timeout_one, stonith_watchdog_timeout_ms);
1879         }
1880 
1881         op->state = st_exec;
1882         if (op->op_timer_one) {
1883             g_source_remove(op->op_timer_one);
1884             op->op_timer_one = 0;
1885         }
1886 
1887         if (!((stonith_watchdog_timeout_ms > 0)
1888               && (pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_none)
1889                   || (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1890                       && pcmk__is_fencing_action(op->action)))
1891               && check_watchdog_fencing_and_wait(op))) {
1892 
1893             /* Some thoughts about self-fencing cases reaching this point:
1894                - Actually check in check_watchdog_fencing_and_wait
1895                  shouldn't fail if STONITH_WATCHDOG_ID is
1896                  chosen as fencing-device and it being present implies
1897                  watchdog-fencing is enabled anyway
1898                - If watchdog-fencing is disabled either in general or for
1899                  a specific target - detected in check_watchdog_fencing_and_wait -
1900                  for some other kind of self-fencing we can't expect
1901                  a success answer but timeout is fine if the node doesn't
1902                  come back in between
1903                - Delicate might be the case where we have watchdog-fencing
1904                  enabled for a node but the watchdog-fencing-device isn't
1905                  explicitly chosen for suicide. Local pe-execution in sbd
1906                  may detect the node as unclean and lead to timely suicide.
1907                  Otherwise the selection of stonith-watchdog-timeout at
1908                  least is questionable.
1909              */
1910 
1911             /* coming here we're not waiting for watchdog timeout -
1912                thus engage timer with timout evaluated before */
1913             op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1914         }
1915 
1916         send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1917         peer->tried = TRUE;
1918         free_xml(remote_op);
1919         return;
1920 
1921     } else if (op->phase == st_phase_on) {
1922         /* A remapped "on" cannot be executed, but the node was already
1923          * turned off successfully, so ignore the error and continue.
1924          */
1925         crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1926                  "after successful 'off'", device, op->target);
1927         advance_topology_device_in_level(op, device, NULL);
1928         return;
1929 
1930     } else if (op->owner == FALSE) {
1931         crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1932                 op->action, op->target, op->client_name);
1933 
1934     } else if (op->query_timer == 0) {
1935         /* We've exhausted all available peers */
1936         crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1937                  CRM_XS " state=%s", op->action, op->target, op->client_name,
1938                  stonith_op_state_str(op->state));
1939         CRM_CHECK(op->state < st_done, return);
1940         finalize_timed_out_op(op, "All nodes failed, or are unable, to "
1941                                   "fence target");
1942 
1943     } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1944         /* if the operation never left the query state,
1945          * but we have all the expected replies, then no devices
1946          * are available to execute the fencing operation. */
1947 
1948         if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
1949            STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
1950             if (check_watchdog_fencing_and_wait(op)) {
1951                 return;
1952             }
1953         }
1954 
1955         if (op->state == st_query) {
1956             crm_info("No peers (out of %d) have devices capable of fencing "
1957                      "(%s) %s for client %s " CRM_XS " state=%s",
1958                      op->replies, op->action, op->target, op->client_name,
1959                      stonith_op_state_str(op->state));
1960 
1961             pcmk__reset_result(&op->result);
1962             pcmk__set_result(&op->result, CRM_EX_ERROR,
1963                              PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1964         } else {
1965             if (pcmk_is_set(op->call_options, st_opt_topology)) {
1966                 pcmk__reset_result(&op->result);
1967                 pcmk__set_result(&op->result, CRM_EX_ERROR,
1968                                  PCMK_EXEC_NO_FENCE_DEVICE, NULL);
1969             }
1970             /* ... else use existing result from previous failed attempt
1971              * (topology is not in use, and no devices remain to be attempted).
1972              * Overwriting the result with PCMK_EXEC_NO_FENCE_DEVICE would
1973              * prevent finalize_op() from setting the correct delegate if
1974              * needed.
1975              */
1976 
1977             crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1978                      "for client %s " CRM_XS " state=%s",
1979                      op->replies, op->action, op->target, op->client_name,
1980                      stonith_op_state_str(op->state));
1981         }
1982 
1983         op->state = st_failed;
1984         finalize_op(op, NULL, false);
1985 
1986     } else {
1987         crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1988                  "for client %s " CRM_XS " id=%.8s",
1989                  op->action, op->target, (device? " using " : ""),
1990                  (device? device : ""), op->client_name, op->id);
1991     }
1992 }
1993 
1994 /*!
1995  * \internal
1996  * \brief Comparison function for sorting query results
1997  *
1998  * \param[in] a  GList item to compare
1999  * \param[in] b  GList item to compare
2000  *
2001  * \return Per the glib documentation, "a negative integer if the first value
2002  *         comes before the second, 0 if they are equal, or a positive integer
2003  *         if the first value comes after the second."
2004  */
2005 static gint
2006 sort_peers(gconstpointer a, gconstpointer b)
     /* [previous][next][first][last][top][bottom][index][help] */
2007 {
2008     const peer_device_info_t *peer_a = a;
2009     const peer_device_info_t *peer_b = b;
2010 
2011     return (peer_b->ndevices - peer_a->ndevices);
2012 }
2013 
2014 /*!
2015  * \internal
2016  * \brief Determine if all the devices in the topology are found or not
2017  *
2018  * \param[in] op  Fencing operation with topology to check
2019  */
2020 static gboolean
2021 all_topology_devices_found(const remote_fencing_op_t *op)
     /* [previous][next][first][last][top][bottom][index][help] */
2022 {
2023     GList *device = NULL;
2024     GList *iter = NULL;
2025     device_properties_t *match = NULL;
2026     stonith_topology_t *tp = NULL;
2027     gboolean skip_target = FALSE;
2028     int i;
2029 
2030     tp = find_topology_for_host(op->target);
2031     if (!tp) {
2032         return FALSE;
2033     }
2034     if (pcmk__is_fencing_action(op->action)) {
2035         /* Don't count the devices on the target node if we are killing
2036          * the target node. */
2037         skip_target = TRUE;
2038     }
2039 
2040     for (i = 0; i < ST_LEVEL_MAX; i++) {
2041         for (device = tp->levels[i]; device; device = device->next) {
2042             match = NULL;
2043             for (iter = op->query_results; iter && !match; iter = iter->next) {
2044                 peer_device_info_t *peer = iter->data;
2045 
2046                 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2047                     continue;
2048                 }
2049                 match = find_peer_device(op, peer, device->data, st_device_supports_none);
2050             }
2051             if (!match) {
2052                 return FALSE;
2053             }
2054         }
2055     }
2056 
2057     return TRUE;
2058 }
2059 
2060 /*!
2061  * \internal
2062  * \brief Parse action-specific device properties from XML
2063  *
2064  * \param[in]     xml     XML element containing the properties
2065  * \param[in]     peer    Name of peer that sent XML (for logs)
2066  * \param[in]     device  Device ID (for logs)
2067  * \param[in]     action  Action the properties relate to (for logs)
2068  * \param[in,out] op      Fencing operation that properties are being parsed for
2069  * \param[in]     phase   Phase the properties relate to
2070  * \param[in,out] props   Device properties to update
2071  */
2072 static void
2073 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
     /* [previous][next][first][last][top][bottom][index][help] */
2074                       const char *action, remote_fencing_op_t *op,
2075                       enum st_remap_phase phase, device_properties_t *props)
2076 {
2077     props->custom_action_timeout[phase] = 0;
2078     crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
2079                           &props->custom_action_timeout[phase]);
2080     if (props->custom_action_timeout[phase]) {
2081         crm_trace("Peer %s with device %s returned %s action timeout %d",
2082                   peer, device, action, props->custom_action_timeout[phase]);
2083     }
2084 
2085     props->delay_max[phase] = 0;
2086     crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
2087     if (props->delay_max[phase]) {
2088         crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
2089                   peer, device, props->delay_max[phase], action);
2090     }
2091 
2092     props->delay_base[phase] = 0;
2093     crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
2094     if (props->delay_base[phase]) {
2095         crm_trace("Peer %s with device %s returned base delay %d for %s",
2096                   peer, device, props->delay_base[phase], action);
2097     }
2098 
2099     /* Handle devices with automatic unfencing */
2100     if (pcmk__str_eq(action, "on", pcmk__str_none)) {
2101         int required = 0;
2102 
2103         crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
2104         if (required) {
2105             crm_trace("Peer %s requires device %s to execute for action %s",
2106                       peer, device, action);
2107             add_required_device(op, device);
2108         }
2109     }
2110 
2111     /* If a reboot is remapped to off+on, it's possible that a node is allowed
2112      * to perform one action but not another.
2113      */
2114     if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
2115         props->disallowed[phase] = TRUE;
2116         crm_trace("Peer %s is disallowed from executing %s for device %s",
2117                   peer, action, device);
2118     }
2119 }
2120 
2121 /*!
2122  * \internal
2123  * \brief Parse one device's properties from peer's XML query reply
2124  *
2125  * \param[in]     xml       XML node containing device properties
2126  * \param[in,out] op        Operation that query and reply relate to
2127  * \param[in,out] peer      Peer's device information
2128  * \param[in]     device    ID of device being parsed
2129  */
2130 static void
2131 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
     /* [previous][next][first][last][top][bottom][index][help] */
2132                       peer_device_info_t *peer, const char *device)
2133 {
2134     xmlNode *child;
2135     int verified = 0;
2136     device_properties_t *props = calloc(1, sizeof(device_properties_t));
2137     int flags = st_device_supports_on; /* Old nodes that don't set the flag assume they support the on action */
2138 
2139     /* Add a new entry to this peer's devices list */
2140     CRM_ASSERT(props != NULL);
2141     g_hash_table_insert(peer->devices, strdup(device), props);
2142 
2143     /* Peers with verified (monitored) access will be preferred */
2144     crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
2145     if (verified) {
2146         crm_trace("Peer %s has confirmed a verified device %s",
2147                   peer->host, device);
2148         props->verified = TRUE;
2149     }
2150 
2151     crm_element_value_int(xml, F_STONITH_DEVICE_SUPPORT_FLAGS, &flags);
2152     props->device_support_flags = flags;
2153 
2154     /* Parse action-specific device properties */
2155     parse_action_specific(xml, peer->host, device, op_requested_action(op),
2156                           op, st_phase_requested, props);
2157     for (child = pcmk__xml_first_child(xml); child != NULL;
2158          child = pcmk__xml_next(child)) {
2159         /* Replies for "reboot" operations will include the action-specific
2160          * values for "off" and "on" in child elements, just in case the reboot
2161          * winds up getting remapped.
2162          */
2163         if (pcmk__str_eq(ID(child), "off", pcmk__str_none)) {
2164             parse_action_specific(child, peer->host, device, "off",
2165                                   op, st_phase_off, props);
2166         } else if (pcmk__str_eq(ID(child), "on", pcmk__str_none)) {
2167             parse_action_specific(child, peer->host, device, "on",
2168                                   op, st_phase_on, props);
2169         }
2170     }
2171 }
2172 
2173 /*!
2174  * \internal
2175  * \brief Parse a peer's XML query reply and add it to operation's results
2176  *
2177  * \param[in,out] op        Operation that query and reply relate to
2178  * \param[in]     host      Name of peer that sent this reply
2179  * \param[in]     ndevices  Number of devices expected in reply
2180  * \param[in]     xml       XML node containing device list
2181  *
2182  * \return Newly allocated result structure with parsed reply
2183  */
2184 static peer_device_info_t *
2185 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
     /* [previous][next][first][last][top][bottom][index][help] */
2186            const xmlNode *xml)
2187 {
2188     peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
2189     xmlNode *child;
2190 
2191     // cppcheck seems not to understand the abort logic in CRM_CHECK
2192     // cppcheck-suppress memleak
2193     CRM_CHECK(peer != NULL, return NULL);
2194     peer->host = strdup(host);
2195     peer->devices = pcmk__strkey_table(free, free);
2196 
2197     /* Each child element describes one capable device available to the peer */
2198     for (child = pcmk__xml_first_child(xml); child != NULL;
2199          child = pcmk__xml_next(child)) {
2200         const char *device = ID(child);
2201 
2202         if (device) {
2203             add_device_properties(child, op, peer, device);
2204         }
2205     }
2206 
2207     peer->ndevices = g_hash_table_size(peer->devices);
2208     CRM_CHECK(ndevices == peer->ndevices,
2209               crm_err("Query claimed to have %d device%s but %d found",
2210                       ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2211 
2212     op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2213     return peer;
2214 }
2215 
2216 /*!
2217  * \internal
2218  * \brief Handle a peer's reply to our fencing query
2219  *
2220  * Parse a query result from XML and store it in the remote operation
2221  * table, and when enough replies have been received, issue a fencing request.
2222  *
2223  * \param[in] msg  XML reply received
2224  *
2225  * \return pcmk_ok on success, -errno on error
2226  *
2227  * \note See initiate_remote_stonith_op() for how the XML query was initially
2228  *       formed, and stonith_query() for how the peer formed its XML reply.
2229  */
2230 int
2231 process_remote_stonith_query(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2232 {
2233     int ndevices = 0;
2234     gboolean host_is_target = FALSE;
2235     gboolean have_all_replies = FALSE;
2236     const char *id = NULL;
2237     const char *host = NULL;
2238     remote_fencing_op_t *op = NULL;
2239     peer_device_info_t *peer = NULL;
2240     uint32_t replies_expected;
2241     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2242 
2243     CRM_CHECK(dev != NULL, return -EPROTO);
2244 
2245     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2246     CRM_CHECK(id != NULL, return -EPROTO);
2247 
2248     dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
2249     CRM_CHECK(dev != NULL, return -EPROTO);
2250     crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
2251 
2252     op = g_hash_table_lookup(stonith_remote_op_list, id);
2253     if (op == NULL) {
2254         crm_debug("Received query reply for unknown or expired operation %s",
2255                   id);
2256         return -EOPNOTSUPP;
2257     }
2258 
2259     replies_expected = fencing_active_peers();
2260     if (op->replies_expected < replies_expected) {
2261         replies_expected = op->replies_expected;
2262     }
2263     if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2264         have_all_replies = TRUE;
2265     }
2266     host = crm_element_value(msg, F_ORIG);
2267     host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2268 
2269     crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2270              op->replies, replies_expected, host,
2271              op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2272     if (ndevices > 0) {
2273         peer = add_result(op, host, ndevices, dev);
2274     }
2275 
2276     pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2277 
2278     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2279         /* If we start the fencing before all the topology results are in,
2280          * it is possible fencing levels will be skipped because of the missing
2281          * query results. */
2282         if (op->state == st_query && all_topology_devices_found(op)) {
2283             /* All the query results are in for the topology, start the fencing ops. */
2284             crm_trace("All topology devices found");
2285             request_peer_fencing(op, peer);
2286 
2287         } else if (have_all_replies) {
2288             crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2289                      replies_expected, op->replies);
2290             request_peer_fencing(op, NULL);
2291         }
2292 
2293     } else if (op->state == st_query) {
2294         int nverified = count_peer_devices(op, peer, TRUE,
2295                                            fenced_support_flag(op->action));
2296 
2297         /* We have a result for a non-topology fencing op that looks promising,
2298          * go ahead and start fencing before query timeout */
2299         if ((peer != NULL) && !host_is_target && nverified) {
2300             /* we have a verified device living on a peer that is not the target */
2301             crm_trace("Found %d verified device%s",
2302                       nverified, pcmk__plural_s(nverified));
2303             request_peer_fencing(op, peer);
2304 
2305         } else if (have_all_replies) {
2306             crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2307                      replies_expected, op->replies);
2308             request_peer_fencing(op, NULL);
2309 
2310         } else {
2311             crm_trace("Waiting for more peer results before launching fencing operation");
2312         }
2313 
2314     } else if ((peer != NULL) && (op->state == st_done)) {
2315         crm_info("Discarding query result from %s (%d device%s): "
2316                  "Operation is %s", peer->host,
2317                  peer->ndevices, pcmk__plural_s(peer->ndevices),
2318                  stonith_op_state_str(op->state));
2319     }
2320 
2321     return pcmk_ok;
2322 }
2323 
2324 /*!
2325  * \internal
2326  * \brief Handle a peer's reply to a fencing request
2327  *
2328  * Parse a fencing reply from XML, and either finalize the operation
2329  * or attempt another device as appropriate.
2330  *
2331  * \param[in] msg  XML reply received
2332  */
2333 void
2334 fenced_process_fencing_reply(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
2335 {
2336     const char *id = NULL;
2337     const char *device = NULL;
2338     remote_fencing_op_t *op = NULL;
2339     xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2340     pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2341 
2342     CRM_CHECK(dev != NULL, return);
2343 
2344     id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2345     CRM_CHECK(id != NULL, return);
2346 
2347     dev = stonith__find_xe_with_result(msg);
2348     CRM_CHECK(dev != NULL, return);
2349 
2350     stonith__xe_get_result(dev, &result);
2351 
2352     device = crm_element_value(dev, F_STONITH_DEVICE);
2353 
2354     if (stonith_remote_op_list) {
2355         op = g_hash_table_lookup(stonith_remote_op_list, id);
2356     }
2357 
2358     if ((op == NULL) && pcmk__result_ok(&result)) {
2359         /* Record successful fencing operations */
2360         const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2361 
2362         op = create_remote_stonith_op(client_id, dev, TRUE);
2363     }
2364 
2365     if (op == NULL) {
2366         /* Could be for an event that began before we started */
2367         /* TODO: Record the op for later querying */
2368         crm_info("Received peer result of unknown or expired operation %s", id);
2369         pcmk__reset_result(&result);
2370         return;
2371     }
2372 
2373     pcmk__reset_result(&op->result);
2374     op->result = result; // The operation takes ownership of the result
2375 
2376     if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2377         crm_err("Received outdated reply for device %s (instead of %s) to "
2378                 "fence (%s) %s. Operation already timed out at peer level.",
2379                 device, (const char *) op->devices->data, op->action, op->target);
2380         return;
2381     }
2382 
2383     if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2384         if (pcmk__result_ok(&op->result)) {
2385             op->state = st_done;
2386         } else {
2387             op->state = st_failed;
2388         }
2389         finalize_op(op, msg, false);
2390         return;
2391 
2392     } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2393         /* If this isn't a remote level broadcast, and we are not the
2394          * originator of the operation, we should not be receiving this msg. */
2395         crm_err("Received non-broadcast fencing result for operation %.8s "
2396                 "we do not own (device %s targeting %s)",
2397                 op->id, device, op->target);
2398         return;
2399     }
2400 
2401     if (pcmk_is_set(op->call_options, st_opt_topology)) {
2402         const char *device = NULL;
2403         const char *reason = op->result.exit_reason;
2404 
2405         /* We own the op, and it is complete. broadcast the result to all nodes
2406          * and notify our local clients. */
2407         if (op->state == st_done) {
2408             finalize_op(op, msg, false);
2409             return;
2410         }
2411 
2412         device = crm_element_value(msg, F_STONITH_DEVICE);
2413 
2414         if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2415             /* A remapped "on" failed, but the node was already turned off
2416              * successfully, so ignore the error and continue.
2417              */
2418             crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2419                      "after successful 'off'",
2420                      device, pcmk_exec_status_str(op->result.execution_status),
2421                      (reason == NULL)? "" : ": ",
2422                      (reason == NULL)? "" : reason,
2423                      op->target);
2424             pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2425         } else {
2426             crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2427                        "%s%s%s%s",
2428                        op->action, op->target,
2429                        ((device == NULL)? "" : " using "),
2430                        ((device == NULL)? "" : device),
2431                        op->client_name,
2432                        op->originator,
2433                        pcmk_exec_status_str(op->result.execution_status),
2434                        (reason == NULL)? "" : " (",
2435                        (reason == NULL)? "" : reason,
2436                        (reason == NULL)? "" : ")");
2437         }
2438 
2439         if (pcmk__result_ok(&op->result)) {
2440             /* An operation completed successfully. Try another device if
2441              * necessary, otherwise mark the operation as done. */
2442             advance_topology_device_in_level(op, device, msg);
2443             return;
2444         } else {
2445             /* This device failed, time to try another topology level. If no other
2446              * levels are available, mark this operation as failed and report results. */
2447             if (advance_topology_level(op, false) != pcmk_rc_ok) {
2448                 op->state = st_failed;
2449                 finalize_op(op, msg, false);
2450                 return;
2451             }
2452         }
2453 
2454     } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2455         op->state = st_done;
2456         finalize_op(op, msg, false);
2457         return;
2458 
2459     } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2460                && (op->devices == NULL)) {
2461         /* If the operation timed out don't bother retrying other peers. */
2462         op->state = st_failed;
2463         finalize_op(op, msg, false);
2464         return;
2465 
2466     } else {
2467         /* fall-through and attempt other fencing action using another peer */
2468     }
2469 
2470     /* Retry on failure */
2471     crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2472               op->target, op->originator, op->client_name,
2473               pcmk_exec_status_str(op->result.execution_status));
2474     request_peer_fencing(op, NULL);
2475 }
2476 
2477 gboolean
2478 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
2479 {
2480     GHashTableIter iter;
2481     time_t now = time(NULL);
2482     remote_fencing_op_t *rop = NULL;
2483 
2484     if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2485         action == NULL) {
2486         return FALSE;
2487     }
2488 
2489     g_hash_table_iter_init(&iter, stonith_remote_op_list);
2490     while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2491         if (strcmp(rop->target, target) != 0) {
2492             continue;
2493         } else if (rop->state != st_done) {
2494             continue;
2495         /* We don't have to worry about remapped reboots here
2496          * because if state is done, any remapping has been undone
2497          */
2498         } else if (strcmp(rop->action, action) != 0) {
2499             continue;
2500         } else if ((rop->completed + tolerance) < now) {
2501             continue;
2502         }
2503 
2504         crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2505                    target, action, tolerance, rop->delegate, rop->originator);
2506         return TRUE;
2507     }
2508     return FALSE;
2509 }

/* [previous][next][first][last][top][bottom][index][help] */