pacemaker  1.1.18-7fdfbbe
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
election.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2004-2016 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/time.h>
11 #include <sys/resource.h>
12 
13 #include <crm/msg_xml.h>
14 #include <crm/common/xml.h>
15 
16 #include <crm/common/mainloop.h>
17 #include <crm/cluster/internal.h>
18 #include <crm/cluster/election.h>
19 #include <crm/crm.h>
20 
21 #define STORM_INTERVAL 2 /* in seconds */
22 
23 struct election_s
24 {
25  enum election_result state;
26  guint count;
27  char *name;
28  char *uname;
29  GSourceFunc cb;
30  GHashTable *voted;
31  mainloop_timer_t *timeout; /* When to stop if not everyone casts a vote */
32 };
33 
34 static void election_complete(election_t *e)
35 {
36  crm_info("Election %s complete", e->name);
37  e->state = election_won;
38 
39  if(e->cb) {
40  e->cb(e);
41  }
42 
43  election_reset(e);
44 }
45 
46 static gboolean election_timer_cb(gpointer user_data)
47 {
48  election_t *e = user_data;
49 
50  crm_info("Election %s %p timed out", e->name, e);
51  election_complete(e);
52  return FALSE;
53 }
54 
55 enum election_result
57 {
58  if(e) {
59  return e->state;
60  }
61  return election_error;
62 }
63 
64 election_t *
65 election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
66 {
67  static guint count = 0;
68  election_t *e = calloc(1, sizeof(election_t));
69 
70  if(e != NULL) {
71  if(name) {
72  e->name = crm_strdup_printf("election-%s", name);
73  } else {
74  e->name = crm_strdup_printf("election-%u", count++);
75  }
76 
77  e->cb = cb;
78  e->uname = strdup(uname);
79  e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, election_timer_cb, e);
80  crm_trace("Created %s %p", e->name, e);
81  }
82  return e;
83 }
84 
85 void
87 {
88  if(e && uname && e->voted) {
89  g_hash_table_remove(e->voted, uname);
90  }
91 }
92 
93 void
95 {
96  crm_trace("Resetting election %s", e->name);
97  if(e) {
98  mainloop_timer_stop(e->timeout);
99  }
100  if (e && e->voted) {
101  crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
102  g_hash_table_destroy(e->voted);
103  e->voted = NULL;
104  }
105 }
106 
107 void
109 {
110  if(e) {
111  election_reset(e);
112  crm_trace("Destroying %s", e->name);
113  mainloop_timer_del(e->timeout);
114  free(e->uname);
115  free(e->name);
116  free(e);
117  }
118 }
119 
120 static void
121 election_timeout_start(election_t *e)
122 {
123  if(e) {
124  mainloop_timer_start(e->timeout);
125  }
126 }
127 
128 void
130 {
131  if(e) {
132  mainloop_timer_stop(e->timeout);
133  }
134 }
135 
136 void
138 {
139  if(e) {
140  mainloop_timer_set_period(e->timeout, period);
141  } else {
142  crm_err("No election defined");
143  }
144 }
145 
146 static int
147 crm_uptime(struct timeval *output)
148 {
149  static time_t expires = 0;
150  static struct rusage info;
151 
152  time_t tm_now = time(NULL);
153 
154  if (expires < tm_now) {
155  int rc = 0;
156 
157  info.ru_utime.tv_sec = 0;
158  info.ru_utime.tv_usec = 0;
159  rc = getrusage(RUSAGE_SELF, &info);
160 
161  output->tv_sec = 0;
162  output->tv_usec = 0;
163 
164  if (rc < 0) {
165  crm_perror(LOG_ERR, "Could not calculate the current uptime");
166  expires = 0;
167  return -1;
168  }
169 
170  crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
171  (long)info.ru_utime.tv_usec);
172  }
173 
174  expires = tm_now + STORM_INTERVAL; /* N seconds after the last _access_ */
175  output->tv_sec = info.ru_utime.tv_sec;
176  output->tv_usec = info.ru_utime.tv_usec;
177 
178  return 1;
179 }
180 
/*!
 * \internal
 * \brief Compare our age (as reported by crm_uptime()) with a peer's
 *
 * Seconds are compared first; microseconds only break a tie.
 *
 * \param[in] your_age  Age reported by the peer
 *
 * \return 1 if we are older, -1 if the peer is older, 0 if the ages are equal
 */
static int
crm_compare_age(struct timeval your_age)
{
    struct timeval our_age;

    /* If an error occurred, our_age will be compared as {0,0} */
    crm_uptime(&our_age);

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec != your_age.tv_usec) {
        if (our_age.tv_usec > your_age.tv_usec) {
            crm_debug("Win: %ld.%ld vs %ld.%ld (usec)",
                      (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
            return 1;
        }
        crm_debug("Lose: %ld.%ld vs %ld.%ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
206 
207 void
209 {
210  struct timeval age;
211  xmlNode *vote = NULL;
212  crm_node_t *our_node;
213 
214  if(e == NULL) {
215  crm_trace("Not voting in election: not initialized");
216  return;
217  }
218 
219  our_node = crm_get_peer(0, e->uname);
220  if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
221  crm_trace("Cannot vote yet: %p", our_node);
222  return;
223  }
224 
225  e->state = election_in_progress;
226  vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
227 
228  e->count++;
229  crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
230  crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
231 
232  crm_uptime(&age);
233  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
234  crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
235 
236  send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
237  free_xml(vote);
238 
239  crm_debug("Started election %d", e->count);
240  if (e->voted) {
241  g_hash_table_destroy(e->voted);
242  e->voted = NULL;
243  }
244 
245  election_timeout_start(e);
246  return;
247 }
248 
249 bool
251 {
252  int voted_size = 0;
253  int num_members = crm_active_peers();
254 
255  if(e == NULL) {
256  crm_trace("not initialized");
257  return FALSE;
258  }
259 
260  if (e->voted) {
261  voted_size = g_hash_table_size(e->voted);
262  }
263  /* in the case of #voted > #members, it is better to
264  * wait for the timeout and give the cluster time to
265  * stabilize
266  */
267  if (voted_size >= num_members) {
268  /* we won and everyone has voted */
270  if (voted_size > num_members) {
271  GHashTableIter gIter;
272  const crm_node_t *node;
273  char *key = NULL;
274 
275  g_hash_table_iter_init(&gIter, crm_peer_cache);
276  while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
277  if (crm_is_peer_active(node)) {
278  crm_err("member: %s proc=%.32x", node->uname, node->processes);
279  }
280  }
281 
282  g_hash_table_iter_init(&gIter, e->voted);
283  while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
284  crm_err("voted: %s", key);
285  }
286 
287  }
288 
289  election_complete(e);
290  return TRUE;
291 
292  } else {
293  crm_debug("Still waiting on %d non-votes (%d total)",
294  num_members - voted_size, num_members);
295  }
296 
297  return FALSE;
298 }
299 
300 #define loss_dampen 2 /* in seconds */
301 
302 /* A_ELECTION_COUNT */
303 enum election_result
304 election_count_vote(election_t *e, xmlNode *vote, bool can_win)
305 {
306  int age = 0;
307  int election_id = -1;
308  int log_level = LOG_INFO;
309  gboolean use_born_on = FALSE;
310  gboolean done = FALSE;
311  gboolean we_lose = FALSE;
312  const char *op = NULL;
313  const char *from = NULL;
314  const char *reason = "unknown";
315  const char *election_owner = NULL;
316  crm_node_t *our_node = NULL, *your_node = NULL;
317 
318  static int election_wins = 0;
319 
320  xmlNode *novote = NULL;
321  time_t tm_now = time(NULL);
322  static time_t expires = 0;
323  static time_t last_election_loss = 0;
324 
325  /* if the membership copy is NULL we REALLY shouldn't be voting
326  * the question is how we managed to get here.
327  */
328 
329  CRM_CHECK(vote != NULL, return election_error);
330 
331  if(e == NULL) {
332  crm_info("Not voting in election: not initialized");
333  return election_lost;
334 
335  } else if(crm_peer_cache == NULL) {
336  crm_info("Not voting in election: no peer cache");
337  return election_lost;
338  }
339 
340  op = crm_element_value(vote, F_CRM_TASK);
341  from = crm_element_value(vote, F_CRM_HOST_FROM);
342  election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER);
343  crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id);
344 
345  your_node = crm_get_peer(0, from);
346  our_node = crm_get_peer(0, e->uname);
347 
348  if (e->voted == NULL) {
349  crm_debug("Created voted hash");
350  e->voted = crm_str_table_new();
351  }
352 
353  if (is_heartbeat_cluster()) {
354  use_born_on = TRUE;
355  } else if (is_classic_ais_cluster()) {
356  use_born_on = TRUE;
357  }
358 
359  if(can_win == FALSE) {
360  reason = "Not eligible";
361  we_lose = TRUE;
362 
363  } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
364  reason = "We are not part of the cluster";
365  log_level = LOG_ERR;
366  we_lose = TRUE;
367 
368  } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) {
369  log_level = LOG_TRACE;
370  reason = "Superseded";
371  done = TRUE;
372 
373  } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
374  /* Possibly we cached the message in the FSA queue at a point that it wasn't */
375  reason = "Peer is not part of our cluster";
376  log_level = LOG_WARNING;
377  done = TRUE;
378 
379  } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
380  char *op_copy = strdup(op);
381  char *uname_copy = strdup(from);
382 
383  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
384 
385  /* update the list of nodes that have voted */
386  g_hash_table_replace(e->voted, uname_copy, op_copy);
387  reason = "Recorded";
388  done = TRUE;
389 
390  } else {
391  struct timeval your_age;
392  const char *your_version = crm_element_value(vote, F_CRM_VERSION);
393  int tv_sec = 0;
394  int tv_usec = 0;
395 
398 
399  your_age.tv_sec = tv_sec;
400  your_age.tv_usec = tv_usec;
401 
402  age = crm_compare_age(your_age);
403  if (crm_str_eq(from, e->uname, TRUE)) {
404  char *op_copy = strdup(op);
405  char *uname_copy = strdup(from);
406 
407  CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
408 
409  /* update ourselves in the list of nodes that have voted */
410  g_hash_table_replace(e->voted, uname_copy, op_copy);
411  reason = "Recorded";
412  done = TRUE;
413 
414  } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
415  reason = "Version";
416  we_lose = TRUE;
417 
418  } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
419  reason = "Version";
420 
421  } else if (age < 0) {
422  reason = "Uptime";
423  we_lose = TRUE;
424 
425  } else if (age > 0) {
426  reason = "Uptime";
427 
428  /* TODO: Check for y(our) born < 0 */
429  } else if (use_born_on && your_node->born < our_node->born) {
430  reason = "Born";
431  we_lose = TRUE;
432 
433  } else if (use_born_on && your_node->born > our_node->born) {
434  reason = "Born";
435 
436  } else if (e->uname == NULL) {
437  reason = "Unknown host name";
438  we_lose = TRUE;
439 
440  } else if (strcasecmp(e->uname, from) > 0) {
441  reason = "Host name";
442  we_lose = TRUE;
443 
444  } else {
445  reason = "Host name";
446  CRM_ASSERT(strcasecmp(e->uname, from) < 0);
447 /* can't happen...
448  * } else if(strcasecmp(e->uname, from) == 0) {
449  *
450  */
451  }
452  }
453 
454  if (expires < tm_now) {
455  election_wins = 0;
456  expires = tm_now + STORM_INTERVAL;
457 
458  } else if (done == FALSE && we_lose == FALSE) {
459  int peers = 1 + g_hash_table_size(crm_peer_cache);
460 
461  /* If every node has to vote down every other node, thats N*(N-1) total elections
462  * Allow some leeway before _really_ complaining
463  */
464  election_wins++;
465  if (election_wins > (peers * peers)) {
466  crm_warn("Election storm detected: %d elections in %d seconds", election_wins,
468  election_wins = 0;
469  expires = tm_now + STORM_INTERVAL;
470  crm_write_blackbox(0, NULL);
471  }
472  }
473 
474  if (done) {
475  do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)",
476  election_id, e->count, election_owner, op, from, reason);
477  return e->state;
478 
479  } else if (we_lose == FALSE) {
480  do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)",
481  election_id, election_owner, op, from, reason);
482 
483  if (last_election_loss == 0
484  || tm_now - last_election_loss > (time_t) loss_dampen) {
485 
486  last_election_loss = 0;
488 
489  /* Start a new election by voting down this, and other, peers */
490  e->state = election_start;
491  return e->state;
492  }
493 
494  crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)",
495  election_id, loss_dampen, ctime(&last_election_loss));
496  }
497 
498  novote = create_request(CRM_OP_NOVOTE, NULL, from,
500 
501  do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)",
502  election_id, election_owner, op, from, reason);
503 
505 
506  crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
507  crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
508 
509  send_cluster_message(your_node, crm_msg_crmd, novote, TRUE);
510  free_xml(novote);
511 
512  last_election_loss = tm_now;
513  e->state = election_lost;
514  return e->state;
515 }
#define F_CRM_TASK
Definition: msg_xml.h:56
#define LOG_TRACE
Definition: logging.h:29
bool election_check(election_t *e)
Definition: election.c:250
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:164
void crm_write_blackbox(int nsig, struct qb_log_callsite *callsite)
Definition: logging.c:426
A dumping ground.
enum election_result election_count_vote(election_t *e, xmlNode *vote, bool can_win)
Definition: election.c:304
void mainloop_timer_start(mainloop_timer_t *t)
Definition: mainloop.c:1186
guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms)
Definition: mainloop.c:1204
void mainloop_timer_del(mainloop_timer_t *t)
Definition: mainloop.c:1242
void election_remove(election_t *e, const char *uname)
Definition: election.c:86
gboolean is_heartbeat_cluster(void)
Definition: cluster.c:645
gboolean crm_is_peer_active(const crm_node_t *node)
Definition: membership.c:295
void election_fini(election_t *e)
Definition: election.c:108
uint64_t born
Definition: cluster.h:74
char * uuid
Definition: cluster.h:83
#define STORM_INTERVAL
Definition: election.c:21
#define CRM_FEATURE_SET
Definition: crm.h:36
#define F_CRM_HOST_FROM
Definition: msg_xml.h:61
struct mainloop_timer_s mainloop_timer_t
Definition: mainloop.h:37
crm_node_t * crm_get_peer(unsigned int id, const char *uname)
Definition: membership.c:676
#define CRM_OP_NOVOTE
Definition: crm.h:120
guint crm_active_peers(void)
Definition: membership.c:393
void mainloop_timer_stop(mainloop_timer_t *t)
Definition: mainloop.c:1195
#define F_CRM_ELECTION_AGE_S
Definition: msg_xml.h:69
Wrappers for and extensions to glib mainloop.
void election_vote(election_t *e)
Definition: election.c:208
struct election_s election_t
Definition: election.h:27
char uname[MAX_NAME]
Definition: internal.h:53
#define crm_warn(fmt, args...)
Definition: logging.h:249
uint32_t processes
Definition: cluster.h:79
#define crm_debug(fmt, args...)
Definition: logging.h:253
election_result
Definition: election.h:29
#define crm_trace(fmt, args...)
Definition: logging.h:254
#define do_crm_log(level, fmt, args...)
Log a message.
Definition: logging.h:129
Wrappers for and extensions to libxml2.
int crm_element_value_int(xmlNode *data, const char *name, int *dest)
Definition: xml.c:3844
const char * crm_element_value(xmlNode *data, const char *name)
Definition: xml.c:5165
void free_xml(xmlNode *child)
Definition: xml.c:2706
gboolean crm_str_eq(const char *a, const char *b, gboolean use_case)
Definition: strings.c:213
#define CRM_SYSTEM_CRMD
Definition: crm.h:90
enum election_result election_state(election_t *e)
Definition: election.c:56
#define CRM_OP_VOTE
Definition: crm.h:119
const char * crm_xml_add(xmlNode *node, const char *name, const char *value)
Definition: xml.c:2490
election_t * election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
Definition: election.c:65
const char * crm_xml_add_int(xmlNode *node, const char *name, int value)
Definition: xml.c:2578
#define F_CRM_ELECTION_AGE_US
Definition: msg_xml.h:70
#define loss_dampen
Definition: election.c:300
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
#define crm_err(fmt, args...)
Definition: logging.h:248
void election_timeout_stop(election_t *e)
Definition: election.c:129
void election_reset(election_t *e)
Definition: election.c:94
int compare_version(const char *version1, const char *version2)
Definition: utils.c:486
#define CRM_ASSERT(expr)
Definition: error.h:35
mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata)
Definition: mainloop.c:1221
char * uname
Definition: cluster.h:82
void election_timeout_set_period(election_t *e, guint period_ms)
Definition: election.c:137
#define F_CRM_ELECTION_ID
Definition: msg_xml.h:68
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
gboolean send_cluster_message(crm_node_t *node, enum crm_ais_msg_types service, xmlNode *data, gboolean ordered)
Definition: cluster.c:271
#define create_request(task, xml_data, host_to, sys_to, sys_from, uuid_from)
Definition: ipc.h:34
GHashTable * crm_peer_cache
Definition: membership.c:44
#define crm_info(fmt, args...)
Definition: logging.h:251
#define F_CRM_VERSION
Definition: msg_xml.h:63
gboolean is_classic_ais_cluster(void)
Definition: cluster.c:624
Functions for conducting elections.
#define F_CRM_ELECTION_OWNER
Definition: msg_xml.h:71