pacemaker  1.1.18-7fdfbbe
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
3  * 2014 Andrew Beekhof <andrew@beekhof.net>
4  *
5  * This source code is licensed under the GNU Lesser General Public License
6  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
7  */
8 
9 #include <crm_internal.h>
10 
11 #include <sched.h>
12 #include <sys/ioctl.h>
13 #include <sys/reboot.h>
14 
15 #include <sys/types.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <ctype.h>
19 #include <dirent.h>
20 
21 #ifdef _POSIX_MEMLOCK
22 # include <sys/mman.h>
23 #endif
24 
/* PID of the sbd daemon as last stored by pcmk_locate_sbd(); 0 until one has
 * been found (pcmk_locate_sbd() also resets negative lookup results to 0).
 */
25 static int sbd_pid = 0;
26 
28 {
33 };
34 
/* procfs file that sysrq_init() reads an integer from and writes the adjusted
 * value back to.
 * NOTE(review): presumably the kernel's SysRq enable bitmask - confirm against
 * the Linux SysRq documentation.
 */
35 #define SYSRQ "/proc/sys/kernel/sysrq"
36 
37 void
39 {
40  static bool need_init = true;
41  FILE* procf;
42  int c;
43 
44  if(need_init) {
45  need_init = false;
46  } else {
47  return;
48  }
49 
50  procf = fopen(SYSRQ, "r");
51  if (!procf) {
52  crm_perror(LOG_ERR, "Cannot open "SYSRQ" for read");
53  return;
54  }
55  if (fscanf(procf, "%d", &c) != 1) {
56  crm_perror(LOG_ERR, "Parsing "SYSRQ" failed");
57  c = 0;
58  }
59  fclose(procf);
60  if (c == 1)
61  return;
62 
63  /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
64  c |= 136;
65  procf = fopen(SYSRQ, "w");
66  if (!procf) {
67  crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
68  return;
69  }
70  fprintf(procf, "%d", c);
71  fclose(procf);
72  return;
73 }
74 
75 static void
76 sysrq_trigger(char t)
77 {
78  FILE *procf;
79 
80  sysrq_init();
81 
82  procf = fopen("/proc/sysrq-trigger", "a");
83  if (!procf) {
84  crm_perror(LOG_ERR, "Opening sysrq-trigger failed");
85  return;
86  }
87  crm_info("sysrq-trigger: %c", t);
88  fprintf(procf, "%c\n", t);
89  fclose(procf);
90  return;
91 }
92 
93 
/*
 * Panic the node from within this process, or hand the request to pacemakerd
 * when this process is not running as root.
 *
 * Three cases, chosen from the effective uid and the parent pid:
 *  - not root, parent pid > 1: log only, as far as this extract shows
 *    (see the review note in that branch);
 *  - not root, parent pid <= 1: look up "pacemakerd" with crm_procfs_pid_of()
 *    and, if a pid > 1 is returned, queue SIGQUIT to it;
 *  - root: issue a SysRq request ('c' when the environment variable
 *    PCMK_panic_action is "crash", otherwise 'b') and then call
 *    reboot(RB_AUTOBOOT). If control returns, log errno and exit().
 */
94 static void
95 pcmk_panic_local(void)
96 {
97  int rc = pcmk_ok;
98  uid_t uid = geteuid();
99  pid_t ppid = getppid();
100 
101  if(uid != 0 && ppid > 1) {
102  /* We're a non-root pacemaker daemon (cib, crmd, pengine,
103  * attrd, etc) with the original pacemakerd parent
104  *
105  * Of these, only crmd is likely to be initiating resets
106  */
107  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
/* NOTE(review): listing line 108 is absent from this extract. Presumably a
 * crm_exit() call (the file's cross-reference index lists crm_exit, yet no
 * visible line uses it) - restore from the upstream source.
 */
109  return;
110 
111  } else if (uid != 0) {
112  /*
113  * No permissions and no pacemakerd parent to escalate to
114  * Track down the new pacemakerd process and send a signal instead
115  */
116  union sigval signal_value;
117 
118  memset(&signal_value, 0, sizeof(signal_value));
119  ppid = crm_procfs_pid_of("pacemakerd");
120  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
121 
/* The signal is only attempted for a pid > 1; the message above is logged
 * regardless of what the lookup returned.
 */
122  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
123  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
124  }
125  /* The best we can do now is die */
/* NOTE(review): listing line 126 is absent from this extract. Presumably the
 * call that actually terminates the process - restore from upstream.
 */
127  return;
128  }
129 
130  /* We're either pacemakerd, or a pacemaker daemon running as root */
131 
132  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
133  sysrq_trigger('c');
134  } else {
135  sysrq_trigger('b');
136  }
137  /* reboot(RB_HALT_SYSTEM); rc = errno; */
138  reboot(RB_AUTOBOOT);
/* Only reached if neither the SysRq request nor reboot() took the node down;
 * errno is captured right away, before logging can change it.
 */
139  rc = errno;
140 
141  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
142 
143  if(ppid > 1) {
144  /* child daemon */
145  exit(pcmk_err_panic);
146  } else {
147  /* pacemakerd or orphan child */
148  exit(DAEMON_RESPAWN_STOP);
149  }
150 }
151 
152 static void
153 pcmk_panic_sbd(void)
154 {
155  union sigval signal_value;
156  pid_t ppid = getppid();
157 
158  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
159 
160  memset(&signal_value, 0, sizeof(signal_value));
161  /* TODO: Arrange for a slightly less brutal option? */
162  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
163  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
164  pcmk_panic_local();
165  }
166 
167  if(ppid > 1) {
168  /* child daemon */
169  exit(pcmk_err_panic);
170  } else {
171  /* pacemakerd or orphan child */
172  exit(DAEMON_RESPAWN_STOP);
173  }
174 }
175 
/*
 * Entry point for panicking the node.
 *
 * origin: text identifying the requester; it is only used in log messages.
 *
 * After refreshing sbd_pid via pcmk_locate_sbd(), the request is routed to
 * pcmk_panic_sbd() when sbd_pid > 1 and to pcmk_panic_local() otherwise.
 * If the "panic-delay" log callsite has any targets set, the panic is skipped:
 * the function logs and returns instead.
 */
176 void
177 pcmk_panic(const char *origin)
178 {
/* Looked up once and kept for later calls */
179  static struct qb_log_callsite *panic_cs = NULL;
180 
181  if (panic_cs == NULL) {
182  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
183  }
184 
185  /* Ensure sbd_pid is set */
186  (void)pcmk_locate_sbd();
187 
/* NOTE(review): presumably "targets" becomes non-zero when an administrator
 * enables the panic-delay trace point - confirm against the libqb docs.
 */
188  if (panic_cs && panic_cs->targets) {
189  /* getppid() == 1 means our original parent no longer exists */
190  do_crm_log_always(LOG_EMERG,
191  "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
192  origin, sbd_pid, getppid());
/* NOTE(review): listing line 193 is absent from this extract. Given the
 * message above it is presumably the shutdown call (crm_exit is listed in the
 * cross-reference index but appears on no visible line) - restore from
 * upstream.
 */
194  return;
195  }
196 
197  if(sbd_pid > 1) {
198  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
199  pcmk_panic_sbd();
200 
201  } else {
202  do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
203  pcmk_panic_local();
204  }
205 }
206 
207 pid_t
209 {
210  char *pidfile = NULL;
211  char *sbd_path = NULL;
212 
213  if(sbd_pid > 1) {
214  return sbd_pid;
215  }
216 
217  /* Look for the pid file */
218  pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);
219  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
220 
221  /* Read the pid file */
222  CRM_ASSERT(pidfile);
223 
224  sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
225  if(sbd_pid > 0) {
226  crm_trace("SBD detected at pid=%d (file)", sbd_pid);
227 
228  } else {
229  /* Fall back to /proc for systems that support it */
230  sbd_pid = crm_procfs_pid_of("sbd");
231  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
232  }
233 
234  if(sbd_pid < 0) {
235  sbd_pid = 0;
236  crm_trace("SBD not detected");
237  }
238 
239  free(pidfile);
240  free(sbd_path);
241 
242  return sbd_pid;
243 }
244 
/*
 * Return the SBD watchdog timeout taken from the environment.
 *
 * The value of the SBD_WATCHDOG_TIMEOUT environment variable is handed to
 * crm_get_msec() and the result is returned as a long.
 *
 * NOTE(review): getenv() yields NULL when the variable is unset and that NULL
 * is passed straight to crm_get_msec(); presumably it copes with NULL -
 * confirm. crm_get_msec() is declared as returning long long, so the value is
 * narrowed on assignment.
 */
long
crm_get_sbd_timeout(void)
{
    const char *env_value = getenv("SBD_WATCHDOG_TIMEOUT");
    long sbd_timeout = crm_get_msec(env_value);

    return sbd_timeout;
}
253 
/*
 * Validate a stonith-watchdog-timeout setting against the running SBD setup.
 *
 * value: the configured timeout as text, or NULL for "default" (treated as 0).
 *
 * Returns TRUE when the timeout is disabled (parsed value <= 0), or when sbd
 * is located and the timeout is not below crm_get_sbd_timeout().
 * Returns FALSE, after an emergency-level log message, when a positive timeout
 * is configured but pcmk_locate_sbd() finds no sbd, or when the timeout is
 * shorter than the SBD timeout.
 */
254 gboolean
255 check_sbd_timeout(const char *value)
256 {
257  long st_timeout = value? crm_get_msec(value) : 0;
258 
259  if (st_timeout <= 0) {
260  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
261  value? value : "default");
262 
263  } else if (pcmk_locate_sbd() == 0) {
264  do_crm_log_always(LOG_EMERG,
265  "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
266  value);
/* NOTE(review): listing line 267 is absent from this extract. Given the
 * "Shutting down" message it is presumably the shutdown call - restore from
 * upstream.
 */
268  return FALSE;
269 
270  } else {
271  long sbd_timeout = crm_get_sbd_timeout();
272 
/* NOTE(review): the test accepts st_timeout == sbd_timeout, while the message
 * below says the value must be strictly greater - confirm which is intended.
 */
273  if (st_timeout < sbd_timeout) {
274  do_crm_log_always(LOG_EMERG,
275  "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
276  value, sbd_timeout);
/* NOTE(review): listing line 277 is absent from this extract. Presumably the
 * same shutdown call as in the branch above - restore from upstream.
 */
278  return FALSE;
279  }
280  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
281  value, sbd_timeout);
282  }
283  return TRUE;
284 }
#define LOG_TRACE
Definition: logging.h:29
long crm_get_sbd_timeout(void)
Definition: watchdog.c:246
gboolean check_sbd_timeout(const char *value)
Definition: watchdog.c:255
long crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: utils.c:818
const char * pcmk_strerror(int rc)
Definition: logging.c:1135
#define pcmk_ok
Definition: error.h:42
long long crm_get_msec(const char *input)
Definition: utils.c:598
unsigned int crm_trace_nonlog
Definition: logging.c:48
pcmk_panic_flags
Definition: watchdog.c:27
#define crm_debug(fmt, args...)
Definition: logging.h:253
#define crm_trace(fmt, args...)
Definition: logging.h:254
void pcmk_panic(const char *origin)
Definition: watchdog.c:177
#define HA_STATE_DIR
Definition: config.h:575
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:213
#define pcmk_err_panic
Definition: error.h:57
#define DAEMON_RESPAWN_STOP
Definition: crm.h:65
#define SBIN_DIR
Definition: config.h:686
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:226
#define CRM_ASSERT(expr)
Definition: error.h:35
int crm_exit(int rc)
Definition: utils.c:83
#define SYSRQ
Definition: watchdog.c:35
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:208
#define safe_str_eq(a, b)
Definition: util.h:72
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:118
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
#define crm_info(fmt, args...)
Definition: logging.h:251
void sysrq_init(void)
Definition: watchdog.c:38