1 | /*
|
---|
2 | ctdb recovery daemon
|
---|
3 |
|
---|
4 | Copyright (C) Ronnie Sahlberg 2007
|
---|
5 |
|
---|
6 | This program is free software; you can redistribute it and/or modify
|
---|
7 | it under the terms of the GNU General Public License as published by
|
---|
8 | the Free Software Foundation; either version 3 of the License, or
|
---|
9 | (at your option) any later version.
|
---|
10 |
|
---|
11 | This program is distributed in the hope that it will be useful,
|
---|
12 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | GNU General Public License for more details.
|
---|
15 |
|
---|
16 | You should have received a copy of the GNU General Public License
|
---|
17 | along with this program; if not, see <http://www.gnu.org/licenses/>.
|
---|
18 | */
|
---|
19 |
|
---|
20 | #include "replace.h"
|
---|
21 | #include "system/filesys.h"
|
---|
22 | #include "system/time.h"
|
---|
23 | #include "system/network.h"
|
---|
24 | #include "system/wait.h"
|
---|
25 |
|
---|
26 | #include <popt.h>
|
---|
27 | #include <talloc.h>
|
---|
28 | #include <tevent.h>
|
---|
29 | #include <tdb.h>
|
---|
30 |
|
---|
31 | #include "lib/tdb_wrap/tdb_wrap.h"
|
---|
32 | #include "lib/util/dlinklist.h"
|
---|
33 | #include "lib/util/debug.h"
|
---|
34 | #include "lib/util/samba_util.h"
|
---|
35 | #include "lib/util/util_process.h"
|
---|
36 |
|
---|
37 | #include "ctdb_private.h"
|
---|
38 | #include "ctdb_client.h"
|
---|
39 |
|
---|
40 | #include "common/system.h"
|
---|
41 | #include "common/cmdline.h"
|
---|
42 | #include "common/common.h"
|
---|
43 | #include "common/logging.h"
|
---|
44 |
|
---|
45 |
|
---|
/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;	/* dlinklist membership (see DLIST_ADD) */
	struct ctdb_srvid_message *request;	/* owned by this list entry (talloc child) */
};

/* Head of a queue of deferred SRVID requests */
struct srvid_requests {
	struct srvid_list *requests;
};
|
---|
55 |
|
---|
/* Send a reply to the originator of an SRVID request and free the
 * request.  A request with srvid == 0 means the sender does not want a
 * reply.  The request is always consumed, whether or not the send
 * succeeds. */
static void srvid_request_reply(struct ctdb_context *ctdb,
				struct ctdb_srvid_message *request,
				TDB_DATA result)
{
	/* Someone that sent srvid==0 does not want a reply */
	if (request->srvid == 0) {
		talloc_free(request);
		return;
	}

	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
				     result) == 0) {
		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
				  (unsigned)request->pnn,
				  (unsigned long long)request->srvid));
	} else {
		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
				 (unsigned)request->pnn,
				 (unsigned long long)request->srvid));
	}

	talloc_free(request);
}
|
---|
79 |
|
---|
/* Send the same reply to every queued SRVID request and free the whole
 * queue.  *requests is reset to NULL (via TALLOC_FREE) so a new queue
 * can be started by srvid_request_add(). */
static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *r;

	for (r = (*requests)->requests; r != NULL; r = r->next) {
		srvid_request_reply(ctdb, r->request, result);
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}
|
---|
93 |
|
---|
/* Queue an SRVID request for later processing.  The queue (*requests)
 * is created on first use.  On allocation failure the request is
 * answered immediately with -ENOMEM and freed; on success the queue
 * takes ownership of the request via talloc_steal(). */
static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct ctdb_srvid_message *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	/* Take ownership of the request and link it into the queue */
	t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list. Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}
|
---|
132 |
|
---|
/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;	/* non-NULL while the operation is disabled */
	bool in_progress;		/* true between ctdb_op_begin() and ctdb_op_end() */
	const char *name;		/* human-readable name used in log messages */
};
|
---|
140 |
|
---|
141 | static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
|
---|
142 | {
|
---|
143 | struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);
|
---|
144 |
|
---|
145 | if (state != NULL) {
|
---|
146 | state->in_progress = false;
|
---|
147 | state->name = name;
|
---|
148 | }
|
---|
149 |
|
---|
150 | return state;
|
---|
151 | }
|
---|
152 |
|
---|
/* An operation counts as disabled while its re-enable timer exists */
static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}
|
---|
157 |
|
---|
/* Mark the operation as started.  Returns false (and logs) if the
 * operation is currently disabled, true otherwise. */
static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}
|
---|
169 |
|
---|
/* Mark the operation as finished.
 * NOTE(review): this returns the value of the assignment, so it always
 * returns false - callers must not treat the return value as a success
 * indicator.  Confirm no caller depends on it. */
static bool ctdb_op_end(struct ctdb_op_state *state)
{
	return state->in_progress = false;
}
|
---|
174 |
|
---|
/* True between ctdb_op_begin() and ctdb_op_end() */
static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}
|
---|
179 |
|
---|
/* Re-enable the operation by cancelling its disable timer */
static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}
|
---|
184 |
|
---|
/* Timer callback: the disable period has expired, so re-enable the
 * operation.  The unused timeval/timer parameters are required by the
 * tevent timer handler signature. */
static void ctdb_op_timeout_handler(struct tevent_context *ev,
				    struct tevent_timer *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}
|
---|
195 |
|
---|
/* Disable the operation for <timeout> seconds.
 * timeout == 0 re-enables the operation immediately.
 * Returns 0 on success, -EAGAIN if the operation is currently in
 * progress, -ENOMEM if the re-enable timer cannot be created. */
static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}
|
---|
229 |
|
---|
/* Per-node record of recovery misbehaviour, used to decide bans */
struct ctdb_banning_state {
	uint32_t count;				/* accumulated culprit credits */
	struct timeval last_reported_time;	/* when this node last misbehaved */
};
|
---|
234 |
|
---|
/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;		/* main ctdb client context */
	uint32_t recmaster;			/* pnn of the current recovery master */
	uint32_t last_culprit_node;		/* last node blamed via ctdb_set_culprit_count() */
	struct ctdb_node_map_old *nodemap;	/* cached node map */
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;			/* this node's own flags; checked before blaming others */
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;	/* queued SRVID reallocate requests */
	struct ctdb_op_state *takeover_run;	/* disable/in-progress tracking for takeover runs */
	struct ctdb_op_state *recovery;		/* disable/in-progress tracking for recoveries */
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;	/* cached cluster capabilities; see update_capabilities() */
	bool frozen_on_inactive;
};
|
---|
257 |
|
---|
/* Control/monitor timeouts, derived from the recovery tunables each
 * time they are evaluated (ctdb must be in scope at the use site) */
#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
|
---|
260 |
|
---|
/* Forward declaration: timer callback that restarts the recovery
 * daemon (definition not visible in this chunk) */
static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);
|
---|
264 |
|
---|
/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_state bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn = pnn;
	bantime.time = ban_time;

	/* Best effort: failure to set the ban is logged but not
	 * reported to the caller */
	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}

}
|
---|
291 |
|
---|
/* Outcome of a cluster monitoring pass */
enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
|
---|
293 |
|
---|
294 |
|
---|
295 | /*
|
---|
296 | remember the trouble maker
|
---|
297 | */
|
---|
298 | static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
|
---|
299 | {
|
---|
300 | struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
|
---|
301 | struct ctdb_banning_state *ban_state;
|
---|
302 |
|
---|
303 | if (culprit > ctdb->num_nodes) {
|
---|
304 | DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
|
---|
305 | return;
|
---|
306 | }
|
---|
307 |
|
---|
308 | /* If we are banned or stopped, do not set other nodes as culprits */
|
---|
309 | if (rec->node_flags & NODE_FLAGS_INACTIVE) {
|
---|
310 | DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
|
---|
311 | return;
|
---|
312 | }
|
---|
313 |
|
---|
314 | if (ctdb->nodes[culprit]->ban_state == NULL) {
|
---|
315 | ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
|
---|
316 | CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
|
---|
317 |
|
---|
318 |
|
---|
319 | }
|
---|
320 | ban_state = ctdb->nodes[culprit]->ban_state;
|
---|
321 | if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
|
---|
322 | /* this was the first time in a long while this node
|
---|
323 | misbehaved so we will forgive any old transgressions.
|
---|
324 | */
|
---|
325 | ban_state->count = 0;
|
---|
326 | }
|
---|
327 |
|
---|
328 | ban_state->count += count;
|
---|
329 | ban_state->last_reported_time = timeval_current();
|
---|
330 | rec->last_culprit_node = culprit;
|
---|
331 | }
|
---|
332 |
|
---|
/*
  remember the trouble maker - convenience wrapper adding a single
  culprit credit
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}
|
---|
340 |
|
---|
341 |
|
---|
/* this callback is called for every node that failed to execute the
   recovered event; the failing node is blamed so it accumulates ban
   credits
*/
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}
|
---|
353 |
|
---|
354 | /*
|
---|
355 | run the "recovered" eventscript on all nodes
|
---|
356 | */
|
---|
357 | static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
|
---|
358 | {
|
---|
359 | TALLOC_CTX *tmp_ctx;
|
---|
360 | uint32_t *nodes;
|
---|
361 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
362 |
|
---|
363 | tmp_ctx = talloc_new(ctdb);
|
---|
364 | CTDB_NO_MEMORY(ctdb, tmp_ctx);
|
---|
365 |
|
---|
366 | nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
|
---|
367 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
|
---|
368 | nodes, 0,
|
---|
369 | CONTROL_TIMEOUT(), false, tdb_null,
|
---|
370 | NULL, recovered_fail_callback,
|
---|
371 | rec) != 0) {
|
---|
372 | DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
|
---|
373 |
|
---|
374 | talloc_free(tmp_ctx);
|
---|
375 | return -1;
|
---|
376 | }
|
---|
377 |
|
---|
378 | talloc_free(tmp_ctx);
|
---|
379 | return 0;
|
---|
380 | }
|
---|
381 |
|
---|
/* this callback is called for every node that failed to execute the
   start recovery event; the failing node is blamed so it accumulates
   ban credits
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}
|
---|
393 |
|
---|
394 | /*
|
---|
395 | run the "startrecovery" eventscript on all nodes
|
---|
396 | */
|
---|
397 | static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
|
---|
398 | {
|
---|
399 | TALLOC_CTX *tmp_ctx;
|
---|
400 | uint32_t *nodes;
|
---|
401 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
402 |
|
---|
403 | tmp_ctx = talloc_new(ctdb);
|
---|
404 | CTDB_NO_MEMORY(ctdb, tmp_ctx);
|
---|
405 |
|
---|
406 | nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
|
---|
407 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
|
---|
408 | nodes, 0,
|
---|
409 | CONTROL_TIMEOUT(), false, tdb_null,
|
---|
410 | NULL,
|
---|
411 | startrecovery_fail_callback,
|
---|
412 | rec) != 0) {
|
---|
413 | DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
|
---|
414 | talloc_free(tmp_ctx);
|
---|
415 | return -1;
|
---|
416 | }
|
---|
417 |
|
---|
418 | talloc_free(tmp_ctx);
|
---|
419 | return 0;
|
---|
420 | }
|
---|
421 |
|
---|
/*
  Retrieve capabilities from all connected nodes, refresh this node's
  own capability word and replace the cached capability map in rec.
  Returns 0 on success, -1 on failure.
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map_old *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* Refresh our own capability word from the collected result */
	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	ctdb->capabilities = *capp;

	/* Replace the cached map, taking ownership away from tmp_ctx */
	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}
|
---|
462 |
|
---|
/* Called for every node that failed to freeze: blame it heavily
 * (one credit per cluster node) so it gets banned quickly */
static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}
|
---|
470 |
|
---|
/* Called for every node that failed to start a recovery transaction:
 * blame it heavily (one credit per cluster node) */
static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}
|
---|
478 |
|
---|
/*
  change recovery mode on all active nodes.  When entering recovery
  (rec_mode == CTDB_RECOVERY_ACTIVE) with freeze == true, additionally
  freeze every database priority level on all active nodes.
  Returns 0 on success, -1 on failure.
 */
static int set_recovery_mode(struct ctdb_context *ctdb,
			     struct ctdb_recoverd *rec,
			     struct ctdb_node_map_old *nodemap,
			     uint32_t rec_mode, bool freeze)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	/* data points at the rec_mode parameter; only valid for the
	 * duration of the synchronous control below */
	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* freeze all nodes */
	if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
		int i;

		/* one freeze control per database priority level */
		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						      nodes, i,
						      CONTROL_TIMEOUT(),
						      false, tdb_null,
						      NULL,
						      set_recmode_fail_callback,
						      rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}
|
---|
532 |
|
---|
/* update all remote nodes to use the same db priority that we have
   this can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
   NOTE(review): despite the name, both controls below target
   CTDB_CURRENT_NODE, and nodemap/pnn/mem_ctx are unused - confirm
   whether the set-priority control is propagated by the daemon.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map_old *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].db_id;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));

		/* best effort: set failures are logged but ignored */
		ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						CTDB_CURRENT_NODE, &db_prio);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
					 db_prio.db_id));
		}
	}

	return 0;
}
|
---|
568 |
|
---|
/*
  ensure all other nodes have attached to any databases that we have.
  For every active remote node, compare its database map to ours and
  create any databases it is missing.  Returns 0 on success, -1 on
  the first failure.
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map_old *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;


			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
					break;
				}
			}
			/* the remote node already have this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
						  dbmap->dbs[db].db_id, mem_ctx,
						  &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
						 nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}
|
---|
631 |
|
---|
632 |
|
---|
633 | /*
|
---|
634 | ensure we are attached to any databases that anyone else is attached to
|
---|
635 | */
|
---|
636 | static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
|
---|
637 | uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
|
---|
638 | {
|
---|
639 | int i, j, db, ret;
|
---|
640 | struct ctdb_dbid_map_old *remote_dbmap;
|
---|
641 |
|
---|
642 | /* verify that we have all database any other node has */
|
---|
643 | for (j=0; j<nodemap->num; j++) {
|
---|
644 | /* we don't need to ourself ourselves */
|
---|
645 | if (nodemap->nodes[j].pnn == pnn) {
|
---|
646 | continue;
|
---|
647 | }
|
---|
648 | /* don't check nodes that are unavailable */
|
---|
649 | if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
|
---|
650 | continue;
|
---|
651 | }
|
---|
652 |
|
---|
653 | ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
|
---|
654 | mem_ctx, &remote_dbmap);
|
---|
655 | if (ret != 0) {
|
---|
656 | DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
|
---|
657 | return -1;
|
---|
658 | }
|
---|
659 |
|
---|
660 | /* step through all databases on the remote node */
|
---|
661 | for (db=0; db<remote_dbmap->num;db++) {
|
---|
662 | const char *name;
|
---|
663 |
|
---|
664 | for (i=0;i<(*dbmap)->num;i++) {
|
---|
665 | if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
|
---|
666 | break;
|
---|
667 | }
|
---|
668 | }
|
---|
669 | /* we already have this db locally */
|
---|
670 | if (i!=(*dbmap)->num) {
|
---|
671 | continue;
|
---|
672 | }
|
---|
673 | /* ok so we need to create this database and
|
---|
674 | rebuild dbmap
|
---|
675 | */
|
---|
676 | ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
|
---|
677 | remote_dbmap->dbs[db].db_id, mem_ctx, &name);
|
---|
678 | if (ret != 0) {
|
---|
679 | DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
|
---|
680 | nodemap->nodes[j].pnn));
|
---|
681 | return -1;
|
---|
682 | }
|
---|
683 | ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
|
---|
684 | remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
|
---|
685 | if (ret != 0) {
|
---|
686 | DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
|
---|
687 | return -1;
|
---|
688 | }
|
---|
689 | ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
|
---|
690 | if (ret != 0) {
|
---|
691 | DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
|
---|
692 | return -1;
|
---|
693 | }
|
---|
694 | }
|
---|
695 | }
|
---|
696 |
|
---|
697 | return 0;
|
---|
698 | }
|
---|
699 |
|
---|
700 |
|
---|
/*
  pull the remote database contents from one node into the recdb.
  Records are walked from the marshalled PULL_DB reply; an incoming
  record replaces an existing one only if it is newer (higher rsn) or
  has the same rsn but a different dmaster than this node.
  Returns 0 on success, -1 on failure.
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data_old *recdata;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	/* sanity-check the reply before touching its payload */
	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = (struct ctdb_rec_data_old *)&reply->data[0];

	/* walk the marshalled records; each record's length field gives
	   the offset of the next one */
	for (i=0;
	     i<reply->count;
	     recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		/* key bytes are followed immediately by the data bytes */
		key.dptr = &recdata->data[0];
		key.dsize = recdata->keylen;
		data.dptr = &recdata->data[key.dsize];
		data.dsize = recdata->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
					 (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			/* tdb_fetch result is malloc'd and must be freed */
			free(existing.dptr);
			/* keep the existing record unless the incoming one
			   is newer, or equally new but mastered elsewhere */
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb_get_pnn(ctdb) &&
			       header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);

	return 0;
}
|
---|
784 |
|
---|
785 |
|
---|
/* Accumulator used by the GET_DB_SEQNUM callbacks to find the node
 * with the highest database sequence number */
struct pull_seqnum_cbdata {
	int failed;		/* non-zero once any node has failed */
	uint32_t pnn;		/* node with the highest seqnum so far; -1 (0xffffffff) until set */
	uint64_t seqnum;	/* highest seqnum seen so far */
};
|
---|
791 |
|
---|
792 | static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
|
---|
793 | {
|
---|
794 | struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
|
---|
795 | uint64_t seqnum;
|
---|
796 |
|
---|
797 | if (cb_data->failed != 0) {
|
---|
798 | DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
|
---|
799 | return;
|
---|
800 | }
|
---|
801 |
|
---|
802 | if (res != 0) {
|
---|
803 | DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
|
---|
804 | cb_data->failed = 1;
|
---|
805 | return;
|
---|
806 | }
|
---|
807 |
|
---|
808 | if (outdata.dsize != sizeof(uint64_t)) {
|
---|
809 | DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
|
---|
810 | cb_data->failed = -1;
|
---|
811 | return;
|
---|
812 | }
|
---|
813 |
|
---|
814 | seqnum = *((uint64_t *)outdata.dptr);
|
---|
815 |
|
---|
816 | if (seqnum > cb_data->seqnum ||
|
---|
817 | (cb_data->pnn == -1 && seqnum == 0)) {
|
---|
818 | cb_data->seqnum = seqnum;
|
---|
819 | cb_data->pnn = node_pnn;
|
---|
820 | }
|
---|
821 | }
|
---|
822 |
|
---|
/* Failure callback for GET_DB_SEQNUM: mark the whole pull operation
 * as failed */
static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
	cb_data->failed = 1;
}
|
---|
830 |
|
---|
831 | static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
|
---|
832 | struct ctdb_recoverd *rec,
|
---|
833 | struct ctdb_node_map_old *nodemap,
|
---|
834 | struct tdb_wrap *recdb, uint32_t dbid)
|
---|
835 | {
|
---|
836 | TALLOC_CTX *tmp_ctx = talloc_new(NULL);
|
---|
837 | uint32_t *nodes;
|
---|
838 | TDB_DATA data;
|
---|
839 | uint32_t outdata[2];
|
---|
840 | struct pull_seqnum_cbdata *cb_data;
|
---|
841 |
|
---|
842 | DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
|
---|
843 |
|
---|
844 | outdata[0] = dbid;
|
---|
845 | outdata[1] = 0;
|
---|
846 |
|
---|
847 | data.dsize = sizeof(outdata);
|
---|
848 | data.dptr = (uint8_t *)&outdata[0];
|
---|
849 |
|
---|
850 | cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
|
---|
851 | if (cb_data == NULL) {
|
---|
852 | DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
|
---|
853 | talloc_free(tmp_ctx);
|
---|
854 | return -1;
|
---|
855 | }
|
---|
856 |
|
---|
857 | cb_data->failed = 0;
|
---|
858 | cb_data->pnn = -1;
|
---|
859 | cb_data->seqnum = 0;
|
---|
860 |
|
---|
861 | nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
|
---|
862 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
|
---|
863 | nodes, 0,
|
---|
864 | CONTROL_TIMEOUT(), false, data,
|
---|
865 | pull_seqnum_cb,
|
---|
866 | pull_seqnum_fail_cb,
|
---|
867 | cb_data) != 0) {
|
---|
868 | DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
|
---|
869 |
|
---|
870 | talloc_free(tmp_ctx);
|
---|
871 | return -1;
|
---|
872 | }
|
---|
873 |
|
---|
874 | if (cb_data->failed != 0) {
|
---|
875 | DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
|
---|
876 | talloc_free(tmp_ctx);
|
---|
877 | return -1;
|
---|
878 | }
|
---|
879 |
|
---|
880 | if (cb_data->pnn == -1) {
|
---|
881 | DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
|
---|
882 | talloc_free(tmp_ctx);
|
---|
883 | return -1;
|
---|
884 | }
|
---|
885 |
|
---|
886 | DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));
|
---|
887 |
|
---|
888 | if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
|
---|
889 | DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
|
---|
890 | talloc_free(tmp_ctx);
|
---|
891 | return -1;
|
---|
892 | }
|
---|
893 |
|
---|
894 | talloc_free(tmp_ctx);
|
---|
895 | return 0;
|
---|
896 | }
|
---|
897 |
|
---|
898 |
|
---|
899 | /*
|
---|
900 | pull all the remote database contents into the recdb
|
---|
901 | */
|
---|
902 | static int pull_remote_database(struct ctdb_context *ctdb,
|
---|
903 | struct ctdb_recoverd *rec,
|
---|
904 | struct ctdb_node_map_old *nodemap,
|
---|
905 | struct tdb_wrap *recdb, uint32_t dbid,
|
---|
906 | bool persistent)
|
---|
907 | {
|
---|
908 | int j;
|
---|
909 |
|
---|
910 | if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
|
---|
911 | int ret;
|
---|
912 | ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
|
---|
913 | if (ret == 0) {
|
---|
914 | return 0;
|
---|
915 | }
|
---|
916 | }
|
---|
917 |
|
---|
918 | /* pull all records from all other nodes across onto this node
|
---|
919 | (this merges based on rsn)
|
---|
920 | */
|
---|
921 | for (j=0; j<nodemap->num; j++) {
|
---|
922 | /* don't merge from nodes that are unavailable */
|
---|
923 | if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
|
---|
924 | continue;
|
---|
925 | }
|
---|
926 | if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
|
---|
927 | DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
|
---|
928 | nodemap->nodes[j].pnn));
|
---|
929 | ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
|
---|
930 | return -1;
|
---|
931 | }
|
---|
932 | }
|
---|
933 |
|
---|
934 | return 0;
|
---|
935 | }
|
---|
936 |
|
---|
937 |
|
---|
938 | /*
|
---|
939 | update flags on all active nodes
|
---|
940 | */
|
---|
941 | static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
|
---|
942 | {
|
---|
943 | int ret;
|
---|
944 |
|
---|
945 | ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
|
---|
946 | if (ret != 0) {
|
---|
947 | DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
|
---|
948 | return -1;
|
---|
949 | }
|
---|
950 |
|
---|
951 | return 0;
|
---|
952 | }
|
---|
953 |
|
---|
954 | /*
|
---|
955 | ensure all nodes have the same vnnmap we do
|
---|
956 | */
|
---|
957 | static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
|
---|
958 | uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
|
---|
959 | {
|
---|
960 | int j, ret;
|
---|
961 |
|
---|
962 | /* push the new vnn map out to all the nodes */
|
---|
963 | for (j=0; j<nodemap->num; j++) {
|
---|
964 | /* don't push to nodes that are unavailable */
|
---|
965 | if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
|
---|
966 | continue;
|
---|
967 | }
|
---|
968 |
|
---|
969 | ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
|
---|
970 | if (ret != 0) {
|
---|
971 | DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
|
---|
972 | return -1;
|
---|
973 | }
|
---|
974 | }
|
---|
975 |
|
---|
976 | return 0;
|
---|
977 | }
|
---|
978 |
|
---|
979 |
|
---|
980 | /*
|
---|
981 | called when a vacuum fetch has completed - just free it and do the next one
|
---|
982 | */
|
---|
/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	/* the call result itself is not needed; releasing the state is all
	   that has to happen here */
	talloc_free(state);
}
|
---|
987 |
|
---|
988 |
|
---|
989 | /**
|
---|
990 | * Process one elements of the vacuum fetch list:
|
---|
991 | * Migrate it over to us with the special flag
|
---|
992 | * CTDB_CALL_FLAG_VACUUM_MIGRATION.
|
---|
993 | */
|
---|
/**
 * Process one elements of the vacuum fetch list:
 * Migrate it over to us with the special flag
 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 *
 * Returns true to continue with the next record (including all the
 * "skip this record" cases: lock contention, missing/short record,
 * record already local).  Returns false only when the migration call
 * could not be set up at all.
 */
static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
				     uint32_t pnn,
				     struct ctdb_rec_data_old *r)
{
	struct ctdb_client_call_state *state;
	TDB_DATA data;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_call call;

	ZERO_STRUCT(call);
	/* null function - we only want the migration side effect */
	call.call_id = CTDB_NULL_FUNC;
	call.flags = CTDB_IMMEDIATE_MIGRATION;
	call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

	/* the key is stored inline at the start of the record payload */
	call.key.dptr = &r->data[0];
	call.key.dsize = r->keylen;

	/* ensure we don't block this daemon - just skip a record if we can't get
	   the chainlock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
		return true;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	/* too small to even contain an ltdb header - nothing to migrate */
	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (hdr->dmaster == pnn) {
		/* its already local */
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	free(data.dptr);

	/* NOTE(review): the chainlock is deliberately held across
	 * ctdb_call_send() and released right after - presumably to keep
	 * the record stable while the call is queued; confirm before
	 * reordering these two statements. */
	state = ctdb_call_send(ctdb_db, &call);
	tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
	if (state == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
		return false;
	}
	state->async.fn = vacuum_fetch_callback;
	state->async.private_data = NULL;

	return true;
}
|
---|
1050 |
|
---|
1051 |
|
---|
1052 | /*
|
---|
1053 | handler for vacuum fetch
|
---|
1054 | */
|
---|
1055 | static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
|
---|
1056 | void *private_data)
|
---|
1057 | {
|
---|
1058 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
1059 | private_data, struct ctdb_recoverd);
|
---|
1060 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1061 | struct ctdb_marshall_buffer *recs;
|
---|
1062 | int ret, i;
|
---|
1063 | TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
|
---|
1064 | const char *name;
|
---|
1065 | struct ctdb_dbid_map_old *dbmap=NULL;
|
---|
1066 | bool persistent = false;
|
---|
1067 | struct ctdb_db_context *ctdb_db;
|
---|
1068 | struct ctdb_rec_data_old *r;
|
---|
1069 |
|
---|
1070 | recs = (struct ctdb_marshall_buffer *)data.dptr;
|
---|
1071 |
|
---|
1072 | if (recs->count == 0) {
|
---|
1073 | goto done;
|
---|
1074 | }
|
---|
1075 |
|
---|
1076 | /* work out if the database is persistent */
|
---|
1077 | ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
|
---|
1078 | if (ret != 0) {
|
---|
1079 | DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
|
---|
1080 | goto done;
|
---|
1081 | }
|
---|
1082 |
|
---|
1083 | for (i=0;i<dbmap->num;i++) {
|
---|
1084 | if (dbmap->dbs[i].db_id == recs->db_id) {
|
---|
1085 | persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
|
---|
1086 | break;
|
---|
1087 | }
|
---|
1088 | }
|
---|
1089 | if (i == dbmap->num) {
|
---|
1090 | DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
|
---|
1091 | goto done;
|
---|
1092 | }
|
---|
1093 |
|
---|
1094 | /* find the name of this database */
|
---|
1095 | if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
|
---|
1096 | DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
|
---|
1097 | goto done;
|
---|
1098 | }
|
---|
1099 |
|
---|
1100 | /* attach to it */
|
---|
1101 | ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
|
---|
1102 | if (ctdb_db == NULL) {
|
---|
1103 | DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
|
---|
1104 | goto done;
|
---|
1105 | }
|
---|
1106 |
|
---|
1107 | r = (struct ctdb_rec_data_old *)&recs->data[0];
|
---|
1108 | while (recs->count) {
|
---|
1109 | bool ok;
|
---|
1110 |
|
---|
1111 | ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
|
---|
1112 | if (!ok) {
|
---|
1113 | break;
|
---|
1114 | }
|
---|
1115 |
|
---|
1116 | r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
|
---|
1117 | recs->count--;
|
---|
1118 | }
|
---|
1119 |
|
---|
1120 | done:
|
---|
1121 | talloc_free(tmp_ctx);
|
---|
1122 | }
|
---|
1123 |
|
---|
1124 |
|
---|
1125 | /*
|
---|
1126 | * handler for database detach
|
---|
1127 | */
|
---|
1128 | static void detach_database_handler(uint64_t srvid, TDB_DATA data,
|
---|
1129 | void *private_data)
|
---|
1130 | {
|
---|
1131 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
1132 | private_data, struct ctdb_recoverd);
|
---|
1133 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1134 | uint32_t db_id;
|
---|
1135 | struct ctdb_db_context *ctdb_db;
|
---|
1136 |
|
---|
1137 | if (data.dsize != sizeof(db_id)) {
|
---|
1138 | return;
|
---|
1139 | }
|
---|
1140 | db_id = *(uint32_t *)data.dptr;
|
---|
1141 |
|
---|
1142 | ctdb_db = find_ctdb_db(ctdb, db_id);
|
---|
1143 | if (ctdb_db == NULL) {
|
---|
1144 | /* database is not attached */
|
---|
1145 | return;
|
---|
1146 | }
|
---|
1147 |
|
---|
1148 | DLIST_REMOVE(ctdb->db_list, ctdb_db);
|
---|
1149 |
|
---|
1150 | DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
|
---|
1151 | ctdb_db->db_name));
|
---|
1152 | talloc_free(ctdb_db);
|
---|
1153 | }
|
---|
1154 |
|
---|
1155 | /*
|
---|
1156 | called when ctdb_wait_timeout should finish
|
---|
1157 | */
|
---|
1158 | static void ctdb_wait_handler(struct tevent_context *ev,
|
---|
1159 | struct tevent_timer *te,
|
---|
1160 | struct timeval yt, void *p)
|
---|
1161 | {
|
---|
1162 | uint32_t *timed_out = (uint32_t *)p;
|
---|
1163 | (*timed_out) = 1;
|
---|
1164 | }
|
---|
1165 |
|
---|
1166 | /*
|
---|
1167 | wait for a given number of seconds
|
---|
1168 | */
|
---|
1169 | static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
|
---|
1170 | {
|
---|
1171 | uint32_t timed_out = 0;
|
---|
1172 | time_t usecs = (secs - (time_t)secs) * 1000000;
|
---|
1173 | tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
|
---|
1174 | ctdb_wait_handler, &timed_out);
|
---|
1175 | while (!timed_out) {
|
---|
1176 | tevent_loop_once(ctdb->ev);
|
---|
1177 | }
|
---|
1178 | }
|
---|
1179 |
|
---|
1180 | /*
|
---|
1181 | called when an election times out (ends)
|
---|
1182 | */
|
---|
1183 | static void ctdb_election_timeout(struct tevent_context *ev,
|
---|
1184 | struct tevent_timer *te,
|
---|
1185 | struct timeval t, void *p)
|
---|
1186 | {
|
---|
1187 | struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
|
---|
1188 | rec->election_timeout = NULL;
|
---|
1189 | fast_start = false;
|
---|
1190 |
|
---|
1191 | DEBUG(DEBUG_WARNING,("Election period ended\n"));
|
---|
1192 | }
|
---|
1193 |
|
---|
1194 |
|
---|
1195 | /*
|
---|
1196 | wait for an election to finish. It finished election_timeout seconds after
|
---|
1197 | the last election packet is received
|
---|
1198 | */
|
---|
1199 | static void ctdb_wait_election(struct ctdb_recoverd *rec)
|
---|
1200 | {
|
---|
1201 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1202 | while (rec->election_timeout) {
|
---|
1203 | tevent_loop_once(ctdb->ev);
|
---|
1204 | }
|
---|
1205 | }
|
---|
1206 |
|
---|
1207 | /*
|
---|
1208 | Update our local flags from all remote connected nodes.
|
---|
1209 | This is only run when we are or we belive we are the recovery master
|
---|
1210 | */
|
---|
1211 | static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
|
---|
1212 | {
|
---|
1213 | int j;
|
---|
1214 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1215 | TALLOC_CTX *mem_ctx = talloc_new(ctdb);
|
---|
1216 |
|
---|
1217 | /* get the nodemap for all active remote nodes and verify
|
---|
1218 | they are the same as for this node
|
---|
1219 | */
|
---|
1220 | for (j=0; j<nodemap->num; j++) {
|
---|
1221 | struct ctdb_node_map_old *remote_nodemap=NULL;
|
---|
1222 | int ret;
|
---|
1223 |
|
---|
1224 | if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
|
---|
1225 | continue;
|
---|
1226 | }
|
---|
1227 | if (nodemap->nodes[j].pnn == ctdb->pnn) {
|
---|
1228 | continue;
|
---|
1229 | }
|
---|
1230 |
|
---|
1231 | ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
|
---|
1232 | mem_ctx, &remote_nodemap);
|
---|
1233 | if (ret != 0) {
|
---|
1234 | DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
|
---|
1235 | nodemap->nodes[j].pnn));
|
---|
1236 | ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
|
---|
1237 | talloc_free(mem_ctx);
|
---|
1238 | return MONITOR_FAILED;
|
---|
1239 | }
|
---|
1240 | if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
|
---|
1241 | /* We should tell our daemon about this so it
|
---|
1242 | updates its flags or else we will log the same
|
---|
1243 | message again in the next iteration of recovery.
|
---|
1244 | Since we are the recovery master we can just as
|
---|
1245 | well update the flags on all nodes.
|
---|
1246 | */
|
---|
1247 | ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
|
---|
1248 | if (ret != 0) {
|
---|
1249 | DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
|
---|
1250 | return -1;
|
---|
1251 | }
|
---|
1252 |
|
---|
1253 | /* Update our local copy of the flags in the recovery
|
---|
1254 | daemon.
|
---|
1255 | */
|
---|
1256 | DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
|
---|
1257 | nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
|
---|
1258 | nodemap->nodes[j].flags));
|
---|
1259 | nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
|
---|
1260 | }
|
---|
1261 | talloc_free(remote_nodemap);
|
---|
1262 | }
|
---|
1263 | talloc_free(mem_ctx);
|
---|
1264 | return MONITOR_OK;
|
---|
1265 | }
|
---|
1266 |
|
---|
1267 |
|
---|
1268 | /* Create a new random generation id.
|
---|
1269 | The generation id can not be the INVALID_GENERATION id
|
---|
1270 | */
|
---|
1271 | static uint32_t new_generation(void)
|
---|
1272 | {
|
---|
1273 | uint32_t generation;
|
---|
1274 |
|
---|
1275 | while (1) {
|
---|
1276 | generation = random();
|
---|
1277 |
|
---|
1278 | if (generation != INVALID_GENERATION) {
|
---|
1279 | break;
|
---|
1280 | }
|
---|
1281 | }
|
---|
1282 |
|
---|
1283 | return generation;
|
---|
1284 | }
|
---|
1285 |
|
---|
1286 |
|
---|
1287 | /*
|
---|
1288 | create a temporary working database
|
---|
1289 | */
|
---|
1290 | static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
|
---|
1291 | {
|
---|
1292 | char *name;
|
---|
1293 | struct tdb_wrap *recdb;
|
---|
1294 | unsigned tdb_flags;
|
---|
1295 |
|
---|
1296 | /* open up the temporary recovery database */
|
---|
1297 | name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
|
---|
1298 | ctdb->db_directory_state,
|
---|
1299 | ctdb->pnn);
|
---|
1300 | if (name == NULL) {
|
---|
1301 | return NULL;
|
---|
1302 | }
|
---|
1303 | unlink(name);
|
---|
1304 |
|
---|
1305 | tdb_flags = TDB_NOLOCK;
|
---|
1306 | if (ctdb->valgrinding) {
|
---|
1307 | tdb_flags |= TDB_NOMMAP;
|
---|
1308 | }
|
---|
1309 | tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);
|
---|
1310 |
|
---|
1311 | recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
|
---|
1312 | tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
|
---|
1313 | if (recdb == NULL) {
|
---|
1314 | DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
|
---|
1315 | }
|
---|
1316 |
|
---|
1317 | talloc_free(name);
|
---|
1318 |
|
---|
1319 | return recdb;
|
---|
1320 | }
|
---|
1321 |
|
---|
1322 |
|
---|
1323 | /*
|
---|
1324 | a traverse function for pulling all relevant records from recdb
|
---|
1325 | */
|
---|
/* State shared with traverse_recdb(): accumulates the records of the
   temporary recovery db into one marshall buffer for pushing out. */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;	/* growing packed-record blob */
	uint32_t len;				/* bytes used in recdata */
	uint32_t allocated_len;			/* bytes allocated for recdata */
	bool failed;				/* set on any traverse error */
	bool persistent;			/* persistent db: keep empty records */
};
|
---|
1334 |
|
---|
/* Traverse callback: append one record from the temporary recovery db
   to the marshall buffer in params, growing the buffer as needed.
   Returns 0 to continue, -1 (with params->failed set) to abort. */
static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data_old *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will chose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}
	/* grow the buffer with headroom so we don't realloc per record */
	if (params->len + recdata->length >= params->allocated_len) {
		params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
		/* NOTE(review): on realloc failure the old buffer pointer is
		 * overwritten with NULL, losing the original allocation (and
		 * recdata parented to it) - the whole recovery is aborted in
		 * that case, but confirm the talloc tree cleanup covers it. */
		params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
	}
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
			 recdata->length + params->len));
		params->failed = true;
		return -1;
	}
	/* append the packed record at the current end of the blob */
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}
|
---|
1405 |
|
---|
1406 | /*
|
---|
1407 | push the recdb database out to all nodes
|
---|
1408 | */
|
---|
/*
  push the recdb database out to all nodes
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	/* NOTE(review): recdata is parented to recdb rather than tmp_ctx;
	 * if the CTDB_NO_MEMORY below returns, tmp_ctx is leaked until its
	 * parent ctdb is freed - confirm this is acceptable. */
	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	/* initial length covers just the marshall buffer header; the
	   traverse appends packed records after it */
	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.allocated_len = params.len;
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	/* traverse_recdb() may have reallocated the buffer */
	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, outdata,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
			     dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);

	return 0;
}
|
---|
1473 |
|
---|
1474 |
|
---|
1475 | /*
|
---|
1476 | go through a full recovery on one database
|
---|
1477 | */
|
---|
1478 | static int recover_database(struct ctdb_recoverd *rec,
|
---|
1479 | TALLOC_CTX *mem_ctx,
|
---|
1480 | uint32_t dbid,
|
---|
1481 | bool persistent,
|
---|
1482 | uint32_t pnn,
|
---|
1483 | struct ctdb_node_map_old *nodemap,
|
---|
1484 | uint32_t transaction_id)
|
---|
1485 | {
|
---|
1486 | struct tdb_wrap *recdb;
|
---|
1487 | int ret;
|
---|
1488 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1489 | TDB_DATA data;
|
---|
1490 | struct ctdb_transdb w;
|
---|
1491 | uint32_t *nodes;
|
---|
1492 |
|
---|
1493 | recdb = create_recdb(ctdb, mem_ctx);
|
---|
1494 | if (recdb == NULL) {
|
---|
1495 | return -1;
|
---|
1496 | }
|
---|
1497 |
|
---|
1498 | /* pull all remote databases onto the recdb */
|
---|
1499 | ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
|
---|
1500 | if (ret != 0) {
|
---|
1501 | DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
|
---|
1502 | return -1;
|
---|
1503 | }
|
---|
1504 |
|
---|
1505 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
|
---|
1506 |
|
---|
1507 | /* wipe all the remote databases. This is safe as we are in a transaction */
|
---|
1508 | w.db_id = dbid;
|
---|
1509 | w.tid = transaction_id;
|
---|
1510 |
|
---|
1511 | data.dptr = (void *)&w;
|
---|
1512 | data.dsize = sizeof(w);
|
---|
1513 |
|
---|
1514 | nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
|
---|
1515 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
|
---|
1516 | nodes, 0,
|
---|
1517 | CONTROL_TIMEOUT(), false, data,
|
---|
1518 | NULL, NULL,
|
---|
1519 | NULL) != 0) {
|
---|
1520 | DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
|
---|
1521 | talloc_free(recdb);
|
---|
1522 | return -1;
|
---|
1523 | }
|
---|
1524 |
|
---|
1525 | /* push out the correct database. This sets the dmaster and skips
|
---|
1526 | the empty records */
|
---|
1527 | ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
|
---|
1528 | if (ret != 0) {
|
---|
1529 | talloc_free(recdb);
|
---|
1530 | return -1;
|
---|
1531 | }
|
---|
1532 |
|
---|
1533 | /* all done with this database */
|
---|
1534 | talloc_free(recdb);
|
---|
1535 |
|
---|
1536 | return 0;
|
---|
1537 | }
|
---|
1538 |
|
---|
1539 | /* when we start a recovery, make sure all nodes use the same reclock file
|
---|
1540 | setting
|
---|
1541 | */
|
---|
1542 | static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
|
---|
1543 | {
|
---|
1544 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1545 | TALLOC_CTX *tmp_ctx = talloc_new(NULL);
|
---|
1546 | TDB_DATA data;
|
---|
1547 | uint32_t *nodes;
|
---|
1548 |
|
---|
1549 | if (ctdb->recovery_lock_file == NULL) {
|
---|
1550 | data.dptr = NULL;
|
---|
1551 | data.dsize = 0;
|
---|
1552 | } else {
|
---|
1553 | data.dsize = strlen(ctdb->recovery_lock_file) + 1;
|
---|
1554 | data.dptr = (uint8_t *)ctdb->recovery_lock_file;
|
---|
1555 | }
|
---|
1556 |
|
---|
1557 | nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
|
---|
1558 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
|
---|
1559 | nodes, 0,
|
---|
1560 | CONTROL_TIMEOUT(),
|
---|
1561 | false, data,
|
---|
1562 | NULL, NULL,
|
---|
1563 | rec) != 0) {
|
---|
1564 | DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
|
---|
1565 | talloc_free(tmp_ctx);
|
---|
1566 | return -1;
|
---|
1567 | }
|
---|
1568 |
|
---|
1569 | talloc_free(tmp_ctx);
|
---|
1570 | return 0;
|
---|
1571 | }
|
---|
1572 |
|
---|
1573 |
|
---|
1574 | /*
|
---|
1575 | * this callback is called for every node that failed to execute ctdb_takeover_run()
|
---|
1576 | * and set flag to re-run takeover run.
|
---|
1577 | */
|
---|
1578 | static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
|
---|
1579 | {
|
---|
1580 | DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
|
---|
1581 |
|
---|
1582 | if (callback_data != NULL) {
|
---|
1583 | struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
|
---|
1584 |
|
---|
1585 | DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
|
---|
1586 |
|
---|
1587 | ctdb_set_culprit(rec, node_pnn);
|
---|
1588 | }
|
---|
1589 | }
|
---|
1590 |
|
---|
1591 |
|
---|
1592 | static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
|
---|
1593 | {
|
---|
1594 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
1595 | int i;
|
---|
1596 | struct ctdb_banning_state *ban_state;
|
---|
1597 |
|
---|
1598 | *self_ban = false;
|
---|
1599 | for (i=0; i<ctdb->num_nodes; i++) {
|
---|
1600 | if (ctdb->nodes[i]->ban_state == NULL) {
|
---|
1601 | continue;
|
---|
1602 | }
|
---|
1603 | ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
|
---|
1604 | if (ban_state->count < 2*ctdb->num_nodes) {
|
---|
1605 | continue;
|
---|
1606 | }
|
---|
1607 |
|
---|
1608 | DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
|
---|
1609 | ctdb->nodes[i]->pnn, ban_state->count,
|
---|
1610 | ctdb->tunable.recovery_ban_period));
|
---|
1611 | ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
|
---|
1612 | ban_state->count = 0;
|
---|
1613 |
|
---|
1614 | /* Banning ourself? */
|
---|
1615 | if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
|
---|
1616 | *self_ban = true;
|
---|
1617 | }
|
---|
1618 | }
|
---|
1619 | }
|
---|
1620 |
|
---|
/* Run a full IP takeover across the cluster.  Temporarily disables
   takeover runs on all other connected nodes, performs the run, then
   re-enables them.  Returns true on success; on failure sets
   rec->need_takeover_run so it is retried, and optionally assigns
   banning credits to failing nodes. */
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map_old *nodemap,
			    bool banning_credits_on_fail)
{
	uint32_t *nodes = NULL;
	struct ctdb_disable_message dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	/* NOTE(review): both early-exit paths below jump to done, which
	 * calls ctdb_op_end() without a matching ctdb_op_begin() -
	 * confirm ctdb_op_end() tolerates that. */
	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress \n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when think they should
	 * be hosting an IP but it isn't yet on an interface.  Don't
	 * wait for replies since a failure here might cause some
	 * noise in the logs but will not actually cause a problem.
	 */
	ZERO_STRUCT(dtr);
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr  = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.timeout = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover_run(rec->ctdb, nodemap,
				rec->force_rebalance_nodes,
				takeover_fail_callback,
				banning_credits_on_fail ? rec : NULL);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.timeout = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}
|
---|
1712 |
|
---|
/* State shared between db_recovery_parallel() and the fd callback
 * ctdb_recovery_handler() while the external recovery helper runs. */
struct recovery_helper_state {
	int fd[2];	/* pipe: helper writes its status to fd[1], parent reads fd[0] */
	pid_t pid;	/* helper child pid, -1 until forked */
	int result;	/* status int reported by the helper (EPIPE on short read) */
	bool done;	/* set by the fd handler; ends the wait loop */
};
|
---|
1719 |
|
---|
1720 | static void ctdb_recovery_handler(struct tevent_context *ev,
|
---|
1721 | struct tevent_fd *fde,
|
---|
1722 | uint16_t flags, void *private_data)
|
---|
1723 | {
|
---|
1724 | struct recovery_helper_state *state = talloc_get_type_abort(
|
---|
1725 | private_data, struct recovery_helper_state);
|
---|
1726 | int ret;
|
---|
1727 |
|
---|
1728 | ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
|
---|
1729 | if (ret != sizeof(state->result)) {
|
---|
1730 | state->result = EPIPE;
|
---|
1731 | }
|
---|
1732 |
|
---|
1733 | state->done = true;
|
---|
1734 | }
|
---|
1735 |
|
---|
1736 |
|
---|
/* Run database recovery by delegating to the external
 * ctdb_recovery_helper program.  Blocks (spinning the event loop)
 * until the helper reports its status over a pipe.
 * Returns 0 on success, -1 on failure. */
static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
	/* Resolved helper path is cached across calls. */
	static char prog[PATH_MAX+1] = "";
	const char **args;
	struct recovery_helper_state *state;
	struct tevent_fd *fde;
	int nargs, ret;

	if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
			     "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
			     "ctdb_recovery_helper")) {
		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
	}

	state = talloc_zero(mem_ctx, struct recovery_helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	state->pid = -1;

	/* NOTE(review): if pipe() fails, state->fd[] is still {0,0}
	 * from talloc_zero, so the fail path below would close fd 0 -
	 * confirm whether this deserves hardening. */
	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create pipe for recovery helper\n"));
		goto fail;
	}

	/* The read end must not leak into the helper child. */
	set_close_on_exec(state->fd[0]);

	/* argv: <write-fd> <daemon socket name> <generation> NULL */
	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
	args[1] = rec->ctdb->daemon.name;
	args[2] = talloc_asprintf(args, "%u", new_generation());
	args[3] = NULL;

	if (args[0] == NULL || args[2] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);

	if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
				     args, NULL, NULL, &state->pid)) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create child for recovery helper\n"));
		goto fail;
	}

	/* Parent no longer needs the write end; the helper owns it. */
	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
			    TEVENT_FD_READ, ctdb_recovery_handler, state);
	if (fde == NULL) {
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	/* Synchronously wait for the helper's status report
	 * (ctdb_recovery_handler sets state->done). */
	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);
	}

	close(state->fd[0]);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	/* Helper reported success; make sure it is gone. */
	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}
|
---|
1834 |
|
---|
/* Serial, recovery-master-driven database recovery, used when at least
 * one active node lacks CTDB_CAP_PARALLEL_RECOVERY.  Freezes the
 * cluster, recovers every database under a cluster-wide transaction,
 * installs a new vnnmap and thaws the cluster again.
 * Returns 0 on success, -1 on failure. */
static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
			      uint32_t pnn, struct ctdb_node_map_old *nodemap,
			      struct ctdb_vnn_map *vnnmap,
			      struct ctdb_dbid_map_old *dbmap)
{
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t generation;
	TDB_DATA data;
	uint32_t *nodes;
	int ret, i, j;

	/* set recovery mode to active on all nodes */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return -1;
	}

	/* execute the "startrecovery" event script on all nodes */
	ret = run_startrecovery_eventscript(rec, nodemap);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
		return -1;
	}

	/* pick a new generation number */
	generation = new_generation();

	/* change the vnnmap on this node to use the new generation
	   number but not on any other nodes.
	   this guarantees that if we abort the recovery prematurely
	   for some reason (a node stops responding?)
	   that we can just return immediately and we will reenter
	   recovery shortly again.
	   I.e. we deliberately leave the cluster with an inconsistent
	   generation id to allow us to abort recovery at any stage and
	   just restart it from scratch.
	 */
	vnnmap->generation = generation;
	ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
		return -1;
	}

	/* Database generations are updated when the transaction is commited to
	 * the databases.  So make sure to use the final generation as the
	 * transaction id
	 */
	generation = new_generation();

	data.dptr = (void *)&generation;
	data.dsize = sizeof(uint32_t);

	nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL,
					transaction_start_fail_callback,
					rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
		/* Best-effort rollback on any nodes that did start. */
		if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
					nodes, 0,
					CONTROL_TIMEOUT(), false, tdb_null,
					NULL,
					NULL,
					NULL) != 0) {
			DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
		}
		return -1;
	}

	DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));

	/* Pull/push every database under the transaction. */
	for (i=0;i<dbmap->num;i++) {
		ret = recover_database(rec, mem_ctx,
				       dbmap->dbs[i].db_id,
				       dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
				       pnn, nodemap, generation);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
			return -1;
		}
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));

	/* commit all the changes */
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
					nodes, 0,
					CONTROL_TIMEOUT(), false, data,
					NULL, NULL,
					NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));

	/* build a new vnn map with all the currently active and
	   unbanned nodes */
	vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
	CTDB_NO_MEMORY(ctdb, vnnmap);
	vnnmap->generation = generation;
	vnnmap->size = 0;
	vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
	CTDB_NO_MEMORY(ctdb, vnnmap->map);
	for (i=j=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (!ctdb_node_has_capabilities(rec->caps,
						ctdb->nodes[i]->pnn,
						CTDB_CAP_LMASTER)) {
			/* this node can not be an lmaster */
			DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
			continue;
		}

		vnnmap->size++;
		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
		CTDB_NO_MEMORY(ctdb, vnnmap->map);
		vnnmap->map[j++] = nodemap->nodes[i].pnn;

	}
	if (vnnmap->size == 0) {
		/* Fall back to just this node so the cluster stays usable. */
		DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
		vnnmap->size++;
		vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
		CTDB_NO_MEMORY(ctdb, vnnmap->map);
		vnnmap->map[0] = pnn;
	}

	/* update to the new vnnmap on all nodes */
	ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));

	/* disable recovery mode */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));

	/* execute the "recovered" event script on all nodes */
	ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));

	return 0;
}
|
---|
1998 |
|
---|
1999 | /*
|
---|
2000 | we are the recmaster, and recovery is needed - start a recovery run
|
---|
2001 | */
|
---|
2002 | static int do_recovery(struct ctdb_recoverd *rec,
|
---|
2003 | TALLOC_CTX *mem_ctx, uint32_t pnn,
|
---|
2004 | struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
|
---|
2005 | {
|
---|
2006 | struct ctdb_context *ctdb = rec->ctdb;
|
---|
2007 | int i, ret;
|
---|
2008 | struct ctdb_dbid_map_old *dbmap;
|
---|
2009 | struct timeval start_time;
|
---|
2010 | bool self_ban;
|
---|
2011 | bool par_recovery;
|
---|
2012 |
|
---|
2013 | DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
|
---|
2014 |
|
---|
2015 | /* Check if the current node is still the recmaster. It's possible that
|
---|
2016 | * re-election has changed the recmaster.
|
---|
2017 | */
|
---|
2018 | if (pnn != rec->recmaster) {
|
---|
2019 | DEBUG(DEBUG_NOTICE,
|
---|
2020 | ("Recovery master changed to %u, aborting recovery\n",
|
---|
2021 | rec->recmaster));
|
---|
2022 | return -1;
|
---|
2023 | }
|
---|
2024 |
|
---|
2025 | /* if recovery fails, force it again */
|
---|
2026 | rec->need_recovery = true;
|
---|
2027 |
|
---|
2028 | if (!ctdb_op_begin(rec->recovery)) {
|
---|
2029 | return -1;
|
---|
2030 | }
|
---|
2031 |
|
---|
2032 | if (rec->election_timeout) {
|
---|
2033 | /* an election is in progress */
|
---|
2034 | DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
|
---|
2035 | goto fail;
|
---|
2036 | }
|
---|
2037 |
|
---|
2038 | ban_misbehaving_nodes(rec, &self_ban);
|
---|
2039 | if (self_ban) {
|
---|
2040 | DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
|
---|
2041 | goto fail;
|
---|
2042 | }
|
---|
2043 |
|
---|
2044 | if (ctdb->recovery_lock_file != NULL) {
|
---|
2045 | if (ctdb_recovery_have_lock(ctdb)) {
|
---|
2046 | DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
|
---|
2047 | } else {
|
---|
2048 | start_time = timeval_current();
|
---|
2049 | DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
|
---|
2050 | ctdb->recovery_lock_file));
|
---|
2051 | if (!ctdb_recovery_lock(ctdb)) {
|
---|
2052 | if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
|
---|
2053 | /* If ctdb is trying first recovery, it's
|
---|
2054 | * possible that current node does not know
|
---|
2055 | * yet who the recmaster is.
|
---|
2056 | */
|
---|
2057 | DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
|
---|
2058 | " - retrying recovery\n"));
|
---|
2059 | goto fail;
|
---|
2060 | }
|
---|
2061 |
|
---|
2062 | DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
|
---|
2063 | "and ban ourself for %u seconds\n",
|
---|
2064 | ctdb->tunable.recovery_ban_period));
|
---|
2065 | ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
|
---|
2066 | goto fail;
|
---|
2067 | }
|
---|
2068 | ctdb_ctrl_report_recd_lock_latency(ctdb,
|
---|
2069 | CONTROL_TIMEOUT(),
|
---|
2070 | timeval_elapsed(&start_time));
|
---|
2071 | DEBUG(DEBUG_NOTICE,
|
---|
2072 | ("Recovery lock taken successfully by recovery daemon\n"));
|
---|
2073 | }
|
---|
2074 | }
|
---|
2075 |
|
---|
2076 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
|
---|
2077 |
|
---|
2078 | /* get a list of all databases */
|
---|
2079 | ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
|
---|
2080 | if (ret != 0) {
|
---|
2081 | DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
|
---|
2082 | goto fail;
|
---|
2083 | }
|
---|
2084 |
|
---|
2085 | /* we do the db creation before we set the recovery mode, so the freeze happens
|
---|
2086 | on all databases we will be dealing with. */
|
---|
2087 |
|
---|
2088 | /* verify that we have all the databases any other node has */
|
---|
2089 | ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
|
---|
2090 | if (ret != 0) {
|
---|
2091 | DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
|
---|
2092 | goto fail;
|
---|
2093 | }
|
---|
2094 |
|
---|
2095 | /* verify that all other nodes have all our databases */
|
---|
2096 | ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
|
---|
2097 | if (ret != 0) {
|
---|
2098 | DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
|
---|
2099 | goto fail;
|
---|
2100 | }
|
---|
2101 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
|
---|
2102 |
|
---|
2103 | /* update the database priority for all remote databases */
|
---|
2104 | ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
|
---|
2105 | if (ret != 0) {
|
---|
2106 | DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
|
---|
2107 | }
|
---|
2108 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
|
---|
2109 |
|
---|
2110 |
|
---|
2111 | /* update all other nodes to use the same setting for reclock files
|
---|
2112 | as the local recovery master.
|
---|
2113 | */
|
---|
2114 | sync_recovery_lock_file_across_cluster(rec);
|
---|
2115 |
|
---|
2116 | /* Retrieve capabilities from all connected nodes */
|
---|
2117 | ret = update_capabilities(rec, nodemap);
|
---|
2118 | if (ret!=0) {
|
---|
2119 | DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
|
---|
2120 | return -1;
|
---|
2121 | }
|
---|
2122 |
|
---|
2123 | /*
|
---|
2124 | update all nodes to have the same flags that we have
|
---|
2125 | */
|
---|
2126 | for (i=0;i<nodemap->num;i++) {
|
---|
2127 | if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
|
---|
2128 | continue;
|
---|
2129 | }
|
---|
2130 |
|
---|
2131 | ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
|
---|
2132 | if (ret != 0) {
|
---|
2133 | if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
|
---|
2134 | DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
|
---|
2135 | } else {
|
---|
2136 | DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
|
---|
2137 | return -1;
|
---|
2138 | }
|
---|
2139 | }
|
---|
2140 | }
|
---|
2141 |
|
---|
2142 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
|
---|
2143 |
|
---|
2144 | /* Check if all participating nodes have parallel recovery capability */
|
---|
2145 | par_recovery = true;
|
---|
2146 | for (i=0; i<nodemap->num; i++) {
|
---|
2147 | if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
|
---|
2148 | continue;
|
---|
2149 | }
|
---|
2150 |
|
---|
2151 | if (!(rec->caps[i].capabilities &
|
---|
2152 | CTDB_CAP_PARALLEL_RECOVERY)) {
|
---|
2153 | par_recovery = false;
|
---|
2154 | break;
|
---|
2155 | }
|
---|
2156 | }
|
---|
2157 |
|
---|
2158 | if (par_recovery) {
|
---|
2159 | ret = db_recovery_parallel(rec, mem_ctx);
|
---|
2160 | } else {
|
---|
2161 | ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
|
---|
2162 | dbmap);
|
---|
2163 | }
|
---|
2164 |
|
---|
2165 | if (ret != 0) {
|
---|
2166 | goto fail;
|
---|
2167 | }
|
---|
2168 |
|
---|
2169 | do_takeover_run(rec, nodemap, false);
|
---|
2170 |
|
---|
2171 | /* send a message to all clients telling them that the cluster
|
---|
2172 | has been reconfigured */
|
---|
2173 | ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
|
---|
2174 | CTDB_SRVID_RECONFIGURE, tdb_null);
|
---|
2175 | if (ret != 0) {
|
---|
2176 | DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
|
---|
2177 | goto fail;
|
---|
2178 | }
|
---|
2179 |
|
---|
2180 | DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
|
---|
2181 |
|
---|
2182 | rec->need_recovery = false;
|
---|
2183 | ctdb_op_end(rec->recovery);
|
---|
2184 |
|
---|
2185 | /* we managed to complete a full recovery, make sure to forgive
|
---|
2186 | any past sins by the nodes that could now participate in the
|
---|
2187 | recovery.
|
---|
2188 | */
|
---|
2189 | DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
|
---|
2190 | for (i=0;i<nodemap->num;i++) {
|
---|
2191 | struct ctdb_banning_state *ban_state;
|
---|
2192 |
|
---|
2193 | if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
|
---|
2194 | continue;
|
---|
2195 | }
|
---|
2196 |
|
---|
2197 | ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
|
---|
2198 | if (ban_state == NULL) {
|
---|
2199 | continue;
|
---|
2200 | }
|
---|
2201 |
|
---|
2202 | ban_state->count = 0;
|
---|
2203 | }
|
---|
2204 |
|
---|
2205 | /* We just finished a recovery successfully.
|
---|
2206 | We now wait for rerecovery_timeout before we allow
|
---|
2207 | another recovery to take place.
|
---|
2208 | */
|
---|
2209 | DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
|
---|
2210 | ctdb_op_disable(rec->recovery, ctdb->ev,
|
---|
2211 | ctdb->tunable.rerecovery_timeout);
|
---|
2212 | return 0;
|
---|
2213 |
|
---|
2214 | fail:
|
---|
2215 | ctdb_op_end(rec->recovery);
|
---|
2216 | return -1;
|
---|
2217 | }
|
---|
2218 |
|
---|
2219 |
|
---|
2220 | /*
|
---|
2221 | elections are won by first checking the number of connected nodes, then
|
---|
2222 | the priority time, then the pnn
|
---|
2223 | */
|
---|
/* On-the-wire payload broadcast on CTDB_SRVID_ELECTION; filled in by
 * ctdb_election_data() and compared by ctdb_election_win(). */
struct election_message {
	uint32_t num_connected;		/* connected nodes seen by the sender */
	struct timeval priority_time;	/* sender's recoverd start time */
	uint32_t pnn;			/* sender's node number */
	uint32_t node_flags;		/* sender's flags (banned/stopped/...) */
};
|
---|
2230 |
|
---|
/*
  form this nodes election data

  Fills *em with this node's election credentials.  As a side effect
  rec->node_flags is refreshed from the current node map.  On nodemap
  lookup failure *em is left zeroed apart from pnn/priority_time.
 */
static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
{
	int ret, i;
	struct ctdb_node_map_old *nodemap;
	struct ctdb_context *ctdb = rec->ctdb;

	ZERO_STRUCTP(em);

	em->pnn = rec->ctdb->pnn;
	em->priority_time = rec->priority_time;

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
		return;
	}

	/* Cache our own flags; ctdb_election_win() reads them too. */
	rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
	em->node_flags = rec->node_flags;

	/* Count every node that is not disconnected (includes self). */
	for (i=0;i<nodemap->num;i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
			em->num_connected++;
		}
	}

	/* we shouldnt try to win this election if we cant be a recmaster */
	if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
		/* weakest possible credentials: no connections and the
		 * newest possible priority time */
		em->num_connected = 0;
		em->priority_time = timeval_current();
	}

	talloc_free(nodemap);
}
|
---|
2268 |
|
---|
2269 | /*
|
---|
2270 | see if the given election data wins
|
---|
2271 | */
|
---|
2272 | static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
|
---|
2273 | {
|
---|
2274 | struct election_message myem;
|
---|
2275 | int cmp = 0;
|
---|
2276 |
|
---|
2277 | ctdb_election_data(rec, &myem);
|
---|
2278 |
|
---|
2279 | /* we cant win if we don't have the recmaster capability */
|
---|
2280 | if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
|
---|
2281 | return false;
|
---|
2282 | }
|
---|
2283 |
|
---|
2284 | /* we cant win if we are banned */
|
---|
2285 | if (rec->node_flags & NODE_FLAGS_BANNED) {
|
---|
2286 | return false;
|
---|
2287 | }
|
---|
2288 |
|
---|
2289 | /* we cant win if we are stopped */
|
---|
2290 | if (rec->node_flags & NODE_FLAGS_STOPPED) {
|
---|
2291 | return false;
|
---|
2292 | }
|
---|
2293 |
|
---|
2294 | /* we will automatically win if the other node is banned */
|
---|
2295 | if (em->node_flags & NODE_FLAGS_BANNED) {
|
---|
2296 | return true;
|
---|
2297 | }
|
---|
2298 |
|
---|
2299 | /* we will automatically win if the other node is banned */
|
---|
2300 | if (em->node_flags & NODE_FLAGS_STOPPED) {
|
---|
2301 | return true;
|
---|
2302 | }
|
---|
2303 |
|
---|
2304 | /* then the longest running node */
|
---|
2305 | if (cmp == 0) {
|
---|
2306 | cmp = timeval_compare(&em->priority_time, &myem.priority_time);
|
---|
2307 | }
|
---|
2308 |
|
---|
2309 | if (cmp == 0) {
|
---|
2310 | cmp = (int)myem.pnn - (int)em->pnn;
|
---|
2311 | }
|
---|
2312 |
|
---|
2313 | return cmp > 0;
|
---|
2314 | }
|
---|
2315 |
|
---|
/*
  send out an election request

  Optimistically records pnn (ourself) as recmaster locally, then
  broadcasts our election credentials to all nodes.  Returns 0 on
  success, -1 if the local recmaster could not be set.
 */
static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
{
	int ret;
	TDB_DATA election_data;
	struct election_message emsg;
	uint64_t srvid;
	struct ctdb_context *ctdb = rec->ctdb;

	srvid = CTDB_SRVID_ELECTION;

	/* Build this node's election credentials. */
	ctdb_election_data(rec, &emsg);

	election_data.dsize = sizeof(struct election_message);
	election_data.dptr = (unsigned char *)&emsg;


	/* first we assume we will win the election and set
	   recoverymaster to be ourself on the current node
	 */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
		return -1;
	}
	rec->recmaster = pnn;

	/* send an election message to all active nodes */
	DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
	return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
}
|
---|
2350 |
|
---|
2351 | /*
|
---|
2352 | we think we are winning the election - send a broadcast election request
|
---|
2353 | */
|
---|
2354 | static void election_send_request(struct tevent_context *ev,
|
---|
2355 | struct tevent_timer *te,
|
---|
2356 | struct timeval t, void *p)
|
---|
2357 | {
|
---|
2358 | struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
|
---|
2359 | int ret;
|
---|
2360 |
|
---|
2361 | ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
|
---|
2362 | if (ret != 0) {
|
---|
2363 | DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
|
---|
2364 | }
|
---|
2365 |
|
---|
2366 | TALLOC_FREE(rec->send_election_te);
|
---|
2367 | }
|
---|
2368 |
|
---|
/*
  handler for memory dumps

  The message body is a ctdb_srvid_message giving the return address
  (pnn + srvid) that the talloc memory report should be sent back to.
 */
static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	TDB_DATA *dump;
	int ret;
	struct ctdb_srvid_message *rd;

	if (data.dsize != sizeof(struct ctdb_srvid_message)) {
		DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
		talloc_free(tmp_ctx);
		return;
	}
	rd = (struct ctdb_srvid_message *)data.dptr;

	dump = talloc_zero(tmp_ctx, TDB_DATA);
	if (dump == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
		talloc_free(tmp_ctx);
		return;
	}
	ret = ctdb_dump_memory(ctdb, dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
		talloc_free(tmp_ctx);
		return;
	}

	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));

	/* Send the dump back to the requester. */
	ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
		talloc_free(tmp_ctx);
		return;
	}

	/* dump is parented on tmp_ctx, so this releases it too. */
	talloc_free(tmp_ctx);
}
|
---|
2413 |
|
---|
2414 | /*
|
---|
2415 | handler for reload_nodes
|
---|
2416 | */
|
---|
2417 | static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
|
---|
2418 | void *private_data)
|
---|
2419 | {
|
---|
2420 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2421 | private_data, struct ctdb_recoverd);
|
---|
2422 |
|
---|
2423 | DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
|
---|
2424 |
|
---|
2425 | ctdb_load_nodes_file(rec->ctdb);
|
---|
2426 | }
|
---|
2427 |
|
---|
2428 |
|
---|
2429 | static void ctdb_rebalance_timeout(struct tevent_context *ev,
|
---|
2430 | struct tevent_timer *te,
|
---|
2431 | struct timeval t, void *p)
|
---|
2432 | {
|
---|
2433 | struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
|
---|
2434 |
|
---|
2435 | if (rec->force_rebalance_nodes == NULL) {
|
---|
2436 | DEBUG(DEBUG_ERR,
|
---|
2437 | ("Rebalance timeout occurred - no nodes to rebalance\n"));
|
---|
2438 | return;
|
---|
2439 | }
|
---|
2440 |
|
---|
2441 | DEBUG(DEBUG_NOTICE,
|
---|
2442 | ("Rebalance timeout occurred - trigger takeover run\n"));
|
---|
2443 | rec->need_takeover_run = true;
|
---|
2444 | }
|
---|
2445 |
|
---|
2446 |
|
---|
/* SRVID handler: remember that IPs should be rebalanced to a node.
 *
 * Appends the PNN carried in the message to rec->force_rebalance_nodes
 * and, if the deferred_rebalance_on_node_add tunable is set, arms a
 * timer that will force a takeover run after that many seconds.
 * Only acts when this node is the recovery master.
 */
static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
					void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn;
	uint32_t *t;
	int len;
	uint32_t deferred_rebalance;

	/* Only the recovery master tracks rebalance targets */
	if (rec->recmaster != ctdb_get_pnn(ctdb)) {
		return;
	}

	if (data.dsize != sizeof(uint32_t)) {
		DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
		return;
	}

	pnn = *(uint32_t *)&data.dptr[0];

	DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));

	/* Copy any existing list of nodes.  There's probably some
	 * sort of realloc variant that will do this but we need to
	 * make sure that freeing the old array also cancels the timer
	 * event for the timeout... not sure if realloc will do that.
	 */
	len = (rec->force_rebalance_nodes != NULL) ?
		talloc_array_length(rec->force_rebalance_nodes) :
		0;

	/* This allows duplicates to be added but they don't cause
	 * harm.  A call to add a duplicate PNN arguably means that
	 * the timeout should be reset, so this is the simplest
	 * solution.
	 */
	t = talloc_zero_array(rec, uint32_t, len+1);
	CTDB_NO_MEMORY_VOID(ctdb, t);
	if (len > 0) {
		memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
	}
	t[len] = pnn;

	/* Freeing the old array also frees any timer parented on it,
	 * which cancels a previously armed rebalance timeout. */
	talloc_free(rec->force_rebalance_nodes);

	rec->force_rebalance_nodes = t;

	/* If configured, setup a deferred takeover run to make sure
	 * that certain nodes get IPs rebalanced to them.  This will
	 * be cancelled if a successful takeover run happens before
	 * the timeout.  Assign tunable value to variable for
	 * readability.
	 */
	deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
	if (deferred_rebalance != 0) {
		/* Timer is deliberately parented on the node array so
		 * that replacing/freeing the array cancels it. */
		tevent_add_timer(ctdb->ev, rec->force_rebalance_nodes,
				 timeval_current_ofs(deferred_rebalance, 0),
				 ctdb_rebalance_timeout, rec);
	}
}
2509 |
|
---|
2510 |
|
---|
2511 |
|
---|
2512 | static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
|
---|
2513 | void *private_data)
|
---|
2514 | {
|
---|
2515 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2516 | private_data, struct ctdb_recoverd);
|
---|
2517 | struct ctdb_public_ip *ip;
|
---|
2518 |
|
---|
2519 | if (rec->recmaster != rec->ctdb->pnn) {
|
---|
2520 | DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
|
---|
2521 | return;
|
---|
2522 | }
|
---|
2523 |
|
---|
2524 | if (data.dsize != sizeof(struct ctdb_public_ip)) {
|
---|
2525 | DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
|
---|
2526 | return;
|
---|
2527 | }
|
---|
2528 |
|
---|
2529 | ip = (struct ctdb_public_ip *)data.dptr;
|
---|
2530 |
|
---|
2531 | update_ip_assignment_tree(rec->ctdb, ip);
|
---|
2532 | }
|
---|
2533 |
|
---|
2534 | static void srvid_disable_and_reply(struct ctdb_context *ctdb,
|
---|
2535 | TDB_DATA data,
|
---|
2536 | struct ctdb_op_state *op_state)
|
---|
2537 | {
|
---|
2538 | struct ctdb_disable_message *r;
|
---|
2539 | uint32_t timeout;
|
---|
2540 | TDB_DATA result;
|
---|
2541 | int32_t ret = 0;
|
---|
2542 |
|
---|
2543 | /* Validate input data */
|
---|
2544 | if (data.dsize != sizeof(struct ctdb_disable_message)) {
|
---|
2545 | DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
|
---|
2546 | "expecting %lu\n", (long unsigned)data.dsize,
|
---|
2547 | (long unsigned)sizeof(struct ctdb_srvid_message)));
|
---|
2548 | return;
|
---|
2549 | }
|
---|
2550 | if (data.dptr == NULL) {
|
---|
2551 | DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
|
---|
2552 | return;
|
---|
2553 | }
|
---|
2554 |
|
---|
2555 | r = (struct ctdb_disable_message *)data.dptr;
|
---|
2556 | timeout = r->timeout;
|
---|
2557 |
|
---|
2558 | ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
|
---|
2559 | if (ret != 0) {
|
---|
2560 | goto done;
|
---|
2561 | }
|
---|
2562 |
|
---|
2563 | /* Returning our PNN tells the caller that we succeeded */
|
---|
2564 | ret = ctdb_get_pnn(ctdb);
|
---|
2565 | done:
|
---|
2566 | result.dsize = sizeof(int32_t);
|
---|
2567 | result.dptr = (uint8_t *)&ret;
|
---|
2568 | srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
|
---|
2569 | }
|
---|
2570 |
|
---|
2571 | static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
|
---|
2572 | void *private_data)
|
---|
2573 | {
|
---|
2574 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2575 | private_data, struct ctdb_recoverd);
|
---|
2576 |
|
---|
2577 | srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
|
---|
2578 | }
|
---|
2579 |
|
---|
2580 | /* Backward compatibility for this SRVID */
|
---|
2581 | static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
|
---|
2582 | void *private_data)
|
---|
2583 | {
|
---|
2584 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2585 | private_data, struct ctdb_recoverd);
|
---|
2586 | uint32_t timeout;
|
---|
2587 |
|
---|
2588 | if (data.dsize != sizeof(uint32_t)) {
|
---|
2589 | DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
|
---|
2590 | "expecting %lu\n", (long unsigned)data.dsize,
|
---|
2591 | (long unsigned)sizeof(uint32_t)));
|
---|
2592 | return;
|
---|
2593 | }
|
---|
2594 | if (data.dptr == NULL) {
|
---|
2595 | DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
|
---|
2596 | return;
|
---|
2597 | }
|
---|
2598 |
|
---|
2599 | timeout = *((uint32_t *)data.dptr);
|
---|
2600 |
|
---|
2601 | ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
|
---|
2602 | }
|
---|
2603 |
|
---|
2604 | static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
|
---|
2605 | void *private_data)
|
---|
2606 | {
|
---|
2607 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2608 | private_data, struct ctdb_recoverd);
|
---|
2609 |
|
---|
2610 | srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
|
---|
2611 | }
|
---|
2612 |
|
---|
2613 | /*
|
---|
2614 | handler for ip reallocate, just add it to the list of requests and
|
---|
2615 | handle this later in the monitor_cluster loop so we do not recurse
|
---|
2616 | with other requests to takeover_run()
|
---|
2617 | */
|
---|
2618 | static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
|
---|
2619 | void *private_data)
|
---|
2620 | {
|
---|
2621 | struct ctdb_srvid_message *request;
|
---|
2622 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2623 | private_data, struct ctdb_recoverd);
|
---|
2624 |
|
---|
2625 | if (data.dsize != sizeof(struct ctdb_srvid_message)) {
|
---|
2626 | DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
|
---|
2627 | return;
|
---|
2628 | }
|
---|
2629 |
|
---|
2630 | request = (struct ctdb_srvid_message *)data.dptr;
|
---|
2631 |
|
---|
2632 | srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
|
---|
2633 | }
|
---|
2634 |
|
---|
2635 | static void process_ipreallocate_requests(struct ctdb_context *ctdb,
|
---|
2636 | struct ctdb_recoverd *rec)
|
---|
2637 | {
|
---|
2638 | TDB_DATA result;
|
---|
2639 | int32_t ret;
|
---|
2640 | struct srvid_requests *current;
|
---|
2641 |
|
---|
2642 | DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
|
---|
2643 |
|
---|
2644 | /* Only process requests that are currently pending. More
|
---|
2645 | * might come in while the takeover run is in progress and
|
---|
2646 | * they will need to be processed later since they might
|
---|
2647 | * be in response flag changes.
|
---|
2648 | */
|
---|
2649 | current = rec->reallocate_requests;
|
---|
2650 | rec->reallocate_requests = NULL;
|
---|
2651 |
|
---|
2652 | if (do_takeover_run(rec, rec->nodemap, false)) {
|
---|
2653 | ret = ctdb_get_pnn(ctdb);
|
---|
2654 | } else {
|
---|
2655 | ret = -1;
|
---|
2656 | }
|
---|
2657 |
|
---|
2658 | result.dsize = sizeof(int32_t);
|
---|
2659 | result.dptr = (uint8_t *)&ret;
|
---|
2660 |
|
---|
2661 | srvid_requests_reply(ctdb, ¤t, result);
|
---|
2662 | }
|
---|
2663 |
|
---|
2664 | /*
|
---|
2665 | * handler for assigning banning credits
|
---|
2666 | */
|
---|
2667 | static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
|
---|
2668 | {
|
---|
2669 | struct ctdb_recoverd *rec = talloc_get_type(
|
---|
2670 | private_data, struct ctdb_recoverd);
|
---|
2671 | uint32_t ban_pnn;
|
---|
2672 |
|
---|
2673 | /* Ignore if we are not recmaster */
|
---|
2674 | if (rec->ctdb->pnn != rec->recmaster) {
|
---|
2675 | return;
|
---|
2676 | }
|
---|
2677 |
|
---|
2678 | if (data.dsize != sizeof(uint32_t)) {
|
---|
2679 | DEBUG(DEBUG_ERR, (__location__ "invalid data size %zu\n",
|
---|
2680 | data.dsize));
|
---|
2681 | return;
|
---|
2682 | }
|
---|
2683 |
|
---|
2684 | ban_pnn = *(uint32_t *)data.dptr;
|
---|
2685 |
|
---|
2686 | ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
|
---|
2687 | }
|
---|
2688 |
|
---|
/*
  handler for recovery master elections

  Processes an election packet from another node: re-arms the election
  timeout, and either schedules our own election request (if we would
  win against the sender's credentials) or concedes by releasing the
  recovery lock and recording the sender as recmaster.

  NOTE(review): data.dsize is not validated before casting data.dptr to
  struct election_message - confirm the messaging layer guarantees the
  payload size.
*/
static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct election_message *em = (struct election_message *)data.dptr;

	/* Ignore election packets from ourself */
	if (ctdb->pnn == em->pnn) {
		return;
	}

	/* we got an election packet - update the timeout for the election */
	talloc_free(rec->election_timeout);
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
				timeval_current_ofs(0, 500000) :
				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	/* someone called an election. check their election data
	   and if we disagree and we would rather be the elected node,
	   send a new election message to all other nodes
	 */
	if (ctdb_election_win(rec, em)) {
		if (!rec->send_election_te) {
			/* delay our counter-request slightly; the timer is
			 * kept in send_election_te so it is only armed once */
			rec->send_election_te = tevent_add_timer(
					ctdb->ev, rec,
					timeval_current_ofs(0, 500000),
					election_send_request, rec);
		}
		return;
	}

	/* we didn't win */
	TALLOC_FREE(rec->send_election_te);

	/* Release the recovery lock file */
	if (ctdb_recovery_have_lock(ctdb)) {
		ctdb_recovery_unlock(ctdb);
	}

	/* we are no longer recmaster, so drop our cached IP assignments */
	clear_ip_assignment_tree(ctdb);

	/* ok, let that guy become recmaster then */
	ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, em->pnn);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster"));
		return;
	}
	rec->recmaster = em->pnn;

	return;
}
2749 |
|
---|
2750 |
|
---|
/*
  force the start of the election process

  Puts the whole cluster into recovery mode (to quiesce internode
  traffic), re-arms the election timeout, broadcasts our election
  request and then blocks in ctdb_wait_election() until the election
  settles.
 */
static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
			   struct ctdb_node_map_old *nodemap)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;

	DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));

	/* set all nodes to recovery mode to stop all internode traffic */
	ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
		return;
	}

	/* (re)start the election timeout; fast_start shortens it to
	 * 0.5s, presumably for test setups - see other timer sites */
	talloc_free(rec->election_timeout);
	rec->election_timeout = tevent_add_timer(
			ctdb->ev, ctdb,
			fast_start ?
				timeval_current_ofs(0, 500000) :
				timeval_current_ofs(ctdb->tunable.election_timeout, 0),
			ctdb_election_timeout, rec);

	ret = send_election_request(rec, pnn);
	if (ret!=0) {
		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election"));
		return;
	}

	/* wait for a few seconds to collect all responses */
	ctdb_wait_election(rec);
}
|
---|
2787 |
|
---|
2788 |
|
---|
/*
  handler for when a node changes its flags

  Re-reads the local nodemap, records the node's new flags and - when
  we are recmaster and the cluster is in normal recovery mode - flags a
  takeover run if the DISABLED state of the node changed.
*/
static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map_old *nodemap=NULL;
	TALLOC_CTX *tmp_ctx;
	int i;
	int disabled_flag_changed;

	if (data.dsize != sizeof(*c)) {
		DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
		return;
	}

	/* temporary context for the nodemap; freed on every exit path */
	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);

	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
		talloc_free(tmp_ctx);
		return;
	}


	/* locate the entry for the node whose flags changed */
	for (i=0;i<nodemap->num;i++) {
		if (nodemap->nodes[i].pnn == c->pnn) break;
	}

	if (i == nodemap->num) {
		DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	if (c->old_flags != c->new_flags) {
		DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
	}

	/* non-zero iff the DISABLED bit differs between our cached
	 * view of the node and the newly announced flags */
	disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;

	nodemap->nodes[i].flags = c->new_flags;

	/* refresh the cached local recovery mode */
	ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
				   CTDB_CURRENT_NODE, &ctdb->recovery_mode);

	if (ret == 0 &&
	    rec->recmaster == ctdb->pnn &&
	    ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
		/* Only do the takeover run if the perm disabled or unhealthy
		   flags changed since these will cause an ip failover but not
		   a recovery.
		   If the node became disconnected or banned this will also
		   lead to an ip address failover but that is handled
		   during recovery
		*/
		if (disabled_flag_changed) {
			rec->need_takeover_run = true;
		}
	}

	talloc_free(tmp_ctx);
}
2858 |
|
---|
/*
  handler for when we need to push out flag changes to all other nodes

  Fetches the authoritative nodemap from the recmaster, validates that
  the changed node exists in it, and broadcasts a MODIFY_FLAGS control
  (carrying the original message payload) to all connected nodes.

  NOTE(review): data.dptr is cast to struct ctdb_node_flag_change
  without a data.dsize check - confirm the messaging layer guarantees
  the payload size.
*/
static void push_flags_handler(uint64_t srvid, TDB_DATA data,
			       void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	int ret;
	struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
	struct ctdb_node_map_old *nodemap=NULL;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	uint32_t *nodes;

	/* read the node flags from the recmaster */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   tmp_ctx, &nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}
	if (c->pnn >= nodemap->num) {
		DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
		talloc_free(tmp_ctx);
		return;
	}

	/* send the flags update to all connected nodes */
	nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
				      nodes, 0, CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));

		talloc_free(tmp_ctx);
		return;
	}

	talloc_free(tmp_ctx);
}
2904 |
|
---|
2905 |
|
---|
/* Shared accumulator between verify_recmode() and its per-node async
 * callback verify_recmode_normal_callback(). */
struct verify_recmode_normal_data {
	uint32_t count;              /* outstanding GET_RECMODE replies */
	enum monitor_result status;  /* aggregated result so far */
};
2910 |
|
---|
2911 | static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
|
---|
2912 | {
|
---|
2913 | struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
|
---|
2914 |
|
---|
2915 |
|
---|
2916 | /* one more node has responded with recmode data*/
|
---|
2917 | rmdata->count--;
|
---|
2918 |
|
---|
2919 | /* if we failed to get the recmode, then return an error and let
|
---|
2920 | the main loop try again.
|
---|
2921 | */
|
---|
2922 | if (state->state != CTDB_CONTROL_DONE) {
|
---|
2923 | if (rmdata->status == MONITOR_OK) {
|
---|
2924 | rmdata->status = MONITOR_FAILED;
|
---|
2925 | }
|
---|
2926 | return;
|
---|
2927 | }
|
---|
2928 |
|
---|
2929 | /* if we got a response, then the recmode will be stored in the
|
---|
2930 | status field
|
---|
2931 | */
|
---|
2932 | if (state->status != CTDB_RECOVERY_NORMAL) {
|
---|
2933 | DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
|
---|
2934 | rmdata->status = MONITOR_RECOVERY_NEEDED;
|
---|
2935 | }
|
---|
2936 |
|
---|
2937 | return;
|
---|
2938 | }
|
---|
2939 |
|
---|
2940 |
|
---|
/* verify that all nodes are in normal recovery mode
 *
 * Sends an async GET_RECMODE to every active node, then spins the
 * event loop until every reply (or failure) has been counted.
 * Returns MONITOR_OK, MONITOR_FAILED, or MONITOR_RECOVERY_NEEDED
 * as aggregated by verify_recmode_normal_callback().
 */
static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
{
	struct verify_recmode_normal_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->count = 0;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmode call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmode_normal_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}


	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	/* NOTE: this nests the event loop; the callback decrements
	 * rmdata->count on every reply or failure */
	while (rmdata->count > 0) {
		tevent_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
2993 |
|
---|
2994 |
|
---|
/* Shared accumulator between verify_recmaster() and its per-node async
 * callback verify_recmaster_callback(). */
struct verify_recmaster_data {
	struct ctdb_recoverd *rec;   /* for culprit accounting on disagreement */
	uint32_t count;              /* outstanding GET_RECMASTER replies */
	uint32_t pnn;                /* the recmaster PNN every node should report */
	enum monitor_result status;  /* aggregated result so far */
};
3001 |
|
---|
3002 | static void verify_recmaster_callback(struct ctdb_client_control_state *state)
|
---|
3003 | {
|
---|
3004 | struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
|
---|
3005 |
|
---|
3006 |
|
---|
3007 | /* one more node has responded with recmaster data*/
|
---|
3008 | rmdata->count--;
|
---|
3009 |
|
---|
3010 | /* if we failed to get the recmaster, then return an error and let
|
---|
3011 | the main loop try again.
|
---|
3012 | */
|
---|
3013 | if (state->state != CTDB_CONTROL_DONE) {
|
---|
3014 | if (rmdata->status == MONITOR_OK) {
|
---|
3015 | rmdata->status = MONITOR_FAILED;
|
---|
3016 | }
|
---|
3017 | return;
|
---|
3018 | }
|
---|
3019 |
|
---|
3020 | /* if we got a response, then the recmaster will be stored in the
|
---|
3021 | status field
|
---|
3022 | */
|
---|
3023 | if (state->status != rmdata->pnn) {
|
---|
3024 | DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
|
---|
3025 | ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
|
---|
3026 | rmdata->status = MONITOR_ELECTION_NEEDED;
|
---|
3027 | }
|
---|
3028 |
|
---|
3029 | return;
|
---|
3030 | }
|
---|
3031 |
|
---|
3032 |
|
---|
/* verify that all nodes agree that we are the recmaster
 *
 * Sends an async GET_RECMASTER to every active node except the current
 * recmaster, then spins the event loop until all replies (or failures)
 * have been counted.  Returns MONITOR_OK, MONITOR_FAILED, or
 * MONITOR_ELECTION_NEEDED as aggregated by verify_recmaster_callback().
 */
static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
{
	struct ctdb_context *ctdb = rec->ctdb;
	struct verify_recmaster_data *rmdata;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
	struct ctdb_client_control_state *state;
	enum monitor_result status;
	int j;

	rmdata = talloc(mem_ctx, struct verify_recmaster_data);
	CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
	rmdata->rec    = rec;
	rmdata->count  = 0;
	rmdata->pnn    = pnn;
	rmdata->status = MONITOR_OK;

	/* loop over all active nodes and send an async getrecmaster call to
	   them*/
	for (j=0; j<nodemap->num; j++) {
		/* skip the recmaster itself */
		if (nodemap->nodes[j].pnn == rec->recmaster) {
			continue;
		}
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
					CONTROL_TIMEOUT(),
					nodemap->nodes[j].pnn);
		if (state == NULL) {
			/* we failed to send the control, treat this as
			   an error and try again next iteration
			*/
			DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}

		/* set up the callback functions */
		state->async.fn = verify_recmaster_callback;
		state->async.private_data = rmdata;

		/* one more control to wait for to complete */
		rmdata->count++;
	}


	/* now wait for up to the maximum number of seconds allowed
	   or until all nodes we expect a response from has replied
	*/
	/* NOTE: this nests the event loop; the callback decrements
	 * rmdata->count on every reply or failure */
	while (rmdata->count > 0) {
		tevent_loop_once(ctdb->ev);
	}

	status = rmdata->status;
	talloc_free(mem_ctx);
	return status;
}
3091 |
|
---|
3092 | static bool interfaces_have_changed(struct ctdb_context *ctdb,
|
---|
3093 | struct ctdb_recoverd *rec)
|
---|
3094 | {
|
---|
3095 | struct ctdb_iface_list_old *ifaces = NULL;
|
---|
3096 | TALLOC_CTX *mem_ctx;
|
---|
3097 | bool ret = false;
|
---|
3098 |
|
---|
3099 | mem_ctx = talloc_new(NULL);
|
---|
3100 |
|
---|
3101 | /* Read the interfaces from the local node */
|
---|
3102 | if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
|
---|
3103 | CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
|
---|
3104 | DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
|
---|
3105 | /* We could return an error. However, this will be
|
---|
3106 | * rare so we'll decide that the interfaces have
|
---|
3107 | * actually changed, just in case.
|
---|
3108 | */
|
---|
3109 | talloc_free(mem_ctx);
|
---|
3110 | return true;
|
---|
3111 | }
|
---|
3112 |
|
---|
3113 | if (!rec->ifaces) {
|
---|
3114 | /* We haven't been here before so things have changed */
|
---|
3115 | DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
|
---|
3116 | ret = true;
|
---|
3117 | } else if (rec->ifaces->num != ifaces->num) {
|
---|
3118 | /* Number of interfaces has changed */
|
---|
3119 | DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
|
---|
3120 | rec->ifaces->num, ifaces->num));
|
---|
3121 | ret = true;
|
---|
3122 | } else {
|
---|
3123 | /* See if interface names or link states have changed */
|
---|
3124 | int i;
|
---|
3125 | for (i = 0; i < rec->ifaces->num; i++) {
|
---|
3126 | struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
|
---|
3127 | if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
|
---|
3128 | DEBUG(DEBUG_NOTICE,
|
---|
3129 | ("Interface in slot %d changed: %s => %s\n",
|
---|
3130 | i, iface->name, ifaces->ifaces[i].name));
|
---|
3131 | ret = true;
|
---|
3132 | break;
|
---|
3133 | }
|
---|
3134 | if (iface->link_state != ifaces->ifaces[i].link_state) {
|
---|
3135 | DEBUG(DEBUG_NOTICE,
|
---|
3136 | ("Interface %s changed state: %d => %d\n",
|
---|
3137 | iface->name, iface->link_state,
|
---|
3138 | ifaces->ifaces[i].link_state));
|
---|
3139 | ret = true;
|
---|
3140 | break;
|
---|
3141 | }
|
---|
3142 | }
|
---|
3143 | }
|
---|
3144 |
|
---|
3145 | talloc_free(rec->ifaces);
|
---|
3146 | rec->ifaces = talloc_steal(rec, ifaces);
|
---|
3147 |
|
---|
3148 | talloc_free(mem_ctx);
|
---|
3149 | return ret;
|
---|
3150 | }
|
---|
3151 |
|
---|
/* called to check that the local allocation of public ip addresses is ok.
 *
 * Returns 0 on success (possibly after asking the recmaster for a
 * takeover run) and -1 if the local public IP lists could not be
 * fetched.
 */
static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map_old *nodemap)
{
	TALLOC_CTX *mem_ctx = talloc_new(NULL);
	int ret, j;
	bool need_takeover_run = false;

	if (interfaces_have_changed(ctdb, rec)) {
		DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
				     "local node %u - force takeover run\n",
				     pnn));
		need_takeover_run = true;
	}

	/* verify that we have the ip addresses we should have
	   and we don't have ones we shouldnt have.
	   if we find an inconsistency we set recmode to
	   active on the local node and wait for the recmaster
	   to do a full blown recovery.
	   also if the pnn is -1 and we are healthy and can host the ip
	   we also request a ip reallocation.
	*/
	if (ctdb->tunable.disable_ip_failover == 0) {
		struct ctdb_public_ip_list_old *ips = NULL;

		/* read the *available* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		/* NOTE(review): nodemap->nodes[pnn] assumes pnn is a
		 * valid index into the nodemap - confirm callers
		 * guarantee this */
		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == -1 &&
			    nodemap->nodes[pnn].flags == 0) {
				DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
						  ctdb_addr_to_str(&ips->ips[j].addr)));
				need_takeover_run = true;
			}
		}

		talloc_free(ips);

		/* read the *known* IPs from the local node */
		ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
			talloc_free(mem_ctx);
			return -1;
		}

		for (j=0; j<ips->num; j++) {
			if (ips->ips[j].pnn == pnn) {
				/* assigned to us: check it is actually
				 * present on an interface */
				if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
					DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));
					need_takeover_run = true;
				}
			} else {
				/* assigned elsewhere: release it locally
				 * if we are still holding it */
				if (ctdb->do_checkpublicip &&
				    ctdb_sys_have_ip(&ips->ips[j].addr)) {

					DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
						ctdb_addr_to_str(&ips->ips[j].addr)));

					if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
						DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
					}
				}
			}
		}
	}

	if (need_takeover_run) {
		struct ctdb_srvid_message rd;
		TDB_DATA data;

		DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));

		/* ask the recmaster (which may be ourselves) to run a
		 * takeover; srvid 0 means no reply is expected */
		ZERO_STRUCT(rd);
		rd.pnn = ctdb->pnn;
		rd.srvid = 0;
		data.dptr = (uint8_t *)&rd;
		data.dsize = sizeof(rd);

		ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
		}
	}
	talloc_free(mem_ctx);
	return 0;
}
3247 |
|
---|
3248 |
|
---|
3249 | static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
|
---|
3250 | {
|
---|
3251 | struct ctdb_node_map_old **remote_nodemaps = callback_data;
|
---|
3252 |
|
---|
3253 | if (node_pnn >= ctdb->num_nodes) {
|
---|
3254 | DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
|
---|
3255 | return;
|
---|
3256 | }
|
---|
3257 |
|
---|
3258 | remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
|
---|
3259 |
|
---|
3260 | }
|
---|
3261 |
|
---|
3262 | static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
|
---|
3263 | struct ctdb_node_map_old *nodemap,
|
---|
3264 | struct ctdb_node_map_old **remote_nodemaps)
|
---|
3265 | {
|
---|
3266 | uint32_t *nodes;
|
---|
3267 |
|
---|
3268 | nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
|
---|
3269 | if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
|
---|
3270 | nodes, 0,
|
---|
3271 | CONTROL_TIMEOUT(), false, tdb_null,
|
---|
3272 | async_getnodemap_callback,
|
---|
3273 | NULL,
|
---|
3274 | remote_nodemaps) != 0) {
|
---|
3275 | DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
|
---|
3276 |
|
---|
3277 | return -1;
|
---|
3278 | }
|
---|
3279 |
|
---|
3280 | return 0;
|
---|
3281 | }
|
---|
3282 |
|
---|
3283 | static int update_recovery_lock_file(struct ctdb_context *ctdb)
|
---|
3284 | {
|
---|
3285 | TALLOC_CTX *tmp_ctx = talloc_new(NULL);
|
---|
3286 | const char *reclockfile;
|
---|
3287 |
|
---|
3288 | if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
|
---|
3289 | DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
|
---|
3290 | talloc_free(tmp_ctx);
|
---|
3291 | return -1;
|
---|
3292 | }
|
---|
3293 |
|
---|
3294 | if (reclockfile == NULL) {
|
---|
3295 | if (ctdb->recovery_lock_file != NULL) {
|
---|
3296 | DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
|
---|
3297 | talloc_free(ctdb->recovery_lock_file);
|
---|
3298 | ctdb->recovery_lock_file = NULL;
|
---|
3299 | ctdb_recovery_unlock(ctdb);
|
---|
3300 | }
|
---|
3301 | talloc_free(tmp_ctx);
|
---|
3302 | return 0;
|
---|
3303 | }
|
---|
3304 |
|
---|
3305 | if (ctdb->recovery_lock_file == NULL) {
|
---|
3306 | DEBUG(DEBUG_NOTICE,
|
---|
3307 | ("Recovery lock file enabled (%s)\n", reclockfile));
|
---|
3308 | ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
|
---|
3309 | ctdb_recovery_unlock(ctdb);
|
---|
3310 | talloc_free(tmp_ctx);
|
---|
3311 | return 0;
|
---|
3312 | }
|
---|
3313 |
|
---|
3314 |
|
---|
3315 | if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
|
---|
3316 | talloc_free(tmp_ctx);
|
---|
3317 | return 0;
|
---|
3318 | }
|
---|
3319 |
|
---|
3320 | DEBUG(DEBUG_NOTICE,
|
---|
3321 | ("Recovery lock file changed (now %s)\n", reclockfile));
|
---|
3322 | talloc_free(ctdb->recovery_lock_file);
|
---|
3323 | ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
|
---|
3324 | ctdb_recovery_unlock(ctdb);
|
---|
3325 |
|
---|
3326 | talloc_free(tmp_ctx);
|
---|
3327 | return 0;
|
---|
3328 | }
|
---|
3329 |
|
---|
/*
 * Decide whether the currently known recovery master is still valid.
 *
 * Runs an ordered series of sanity checks against rec->recmaster and
 * returns:
 *   MONITOR_ELECTION_NEEDED - recmaster unknown, lacks the RECMASTER
 *                             capability (while we have it), has been
 *                             deleted, is disconnected, or is inactive
 *   MONITOR_FAILED          - could not fetch the recmaster's nodemap
 *   MONITOR_OK              - recmaster passed all checks
 *
 * The checks are ordered from cheapest/local to most expensive (the
 * final check issues a GETNODEMAP control to the recmaster).
 */
static enum monitor_result validate_recovery_master(struct ctdb_recoverd *rec,
						    TALLOC_CTX *mem_ctx)
{
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t pnn = ctdb_get_pnn(ctdb);
	struct ctdb_node_map_old *nodemap = rec->nodemap;
	struct ctdb_node_map_old *recmaster_nodemap = NULL;
	int ret;

	/* When recovery daemon is started, recmaster is set to
	 * "unknown" so it knows to start an election.
	 */
	if (rec->recmaster == CTDB_UNKNOWN_PNN) {
		DEBUG(DEBUG_NOTICE,
		      ("Initial recovery master set - forcing election\n"));
		return MONITOR_ELECTION_NEEDED;
	}

	/*
	 * If the current recmaster does not have CTDB_CAP_RECMASTER,
	 * but we have, then force an election and try to become the new
	 * recmaster.  Only do this while this node is itself active,
	 * otherwise we could never win anyway.
	 */
	if (!ctdb_node_has_capabilities(rec->caps,
					rec->recmaster,
					CTDB_CAP_RECMASTER) &&
	    (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
	    !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
		DEBUG(DEBUG_ERR,
		      (" Current recmaster node %u does not have CAP_RECMASTER,"
		       " but we (node %u) have - force an election\n",
		       rec->recmaster, pnn));
		return MONITOR_ELECTION_NEEDED;
	}

	/* Verify that the master node has not been deleted.  This
	 * should not happen because a node should always be shutdown
	 * before being deleted, causing a new master to be elected
	 * before now.  However, if something strange has happened
	 * then checking here will ensure we don't index beyond the
	 * end of the nodemap array. */
	if (rec->recmaster >= nodemap->num) {
		DEBUG(DEBUG_ERR,
		      ("Recmaster node %u has been deleted. Force election\n",
		       rec->recmaster));
		return MONITOR_ELECTION_NEEDED;
	}

	/* if recovery master is disconnected/deleted we must elect a new recmaster */
	if (nodemap->nodes[rec->recmaster].flags &
	    (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is disconnected/deleted. Force election\n",
		       rec->recmaster));
		return MONITOR_ELECTION_NEEDED;
	}

	/* get nodemap from the recovery master to check if it is inactive */
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
				   mem_ctx, &recmaster_nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Unable to get nodemap from recovery master %u\n",
		       rec->recmaster));
		return MONITOR_FAILED;
	}

	/* Only force an election when the recmaster considers itself
	 * inactive while this node is active - an inactive node must
	 * not try to trigger elections (see main_loop). */
	if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
	    (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
		DEBUG(DEBUG_NOTICE,
		      ("Recmaster node %u is inactive. Force election\n",
		       rec->recmaster));
		/*
		 * update our nodemap to carry the recmaster's notion of
		 * its own flags, so that we don't keep freezing the
		 * inactive recmaster node...
		 */
		nodemap->nodes[rec->recmaster].flags =
			recmaster_nodemap->nodes[rec->recmaster].flags;
		return MONITOR_ELECTION_NEEDED;
	}

	return MONITOR_OK;
}
|
---|
3416 |
|
---|
/*
 * One iteration of the recovery daemon's monitoring work, called from
 * monitor_cluster() roughly once per RecoverInterval.
 *
 * Ordering matters throughout: each stage either returns early (to
 * retry on the next iteration), triggers an election, or escalates to
 * do_recovery().  The broad sequence is:
 *   1. liveness ping/keepalive with the main ctdbd
 *   2. refresh local state (debug level, tunables, runstate, reclock,
 *      nodemap, own flags)
 *   3. handle the inactive (stopped/banned) case and bail out
 *   4. validate the recovery master, possibly forcing an election
 *   5. recmaster-only consistency checks (flags, nodemaps, vnnmap)
 *      that can trigger do_recovery() or a takeover run
 */
static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
		      TALLOC_CTX *mem_ctx)
{
	uint32_t pnn;
	struct ctdb_node_map_old *nodemap=NULL;
	struct ctdb_node_map_old **remote_nodemaps=NULL;
	struct ctdb_vnn_map *vnnmap=NULL;
	struct ctdb_vnn_map *remote_vnnmap=NULL;
	uint32_t num_lmasters;
	int32_t debug_level;
	int i, j, ret;
	bool self_ban;


	/* verify that the main daemon is still running - the recovery
	 * daemon has no reason to live without its parent */
	if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
		DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
		exit(-1);
	}

	/* ping the local daemon to tell it we are alive */
	ctdb_ctrl_recd_ping(ctdb);

	if (rec->election_timeout) {
		/* an election is in progress - do nothing until it settles */
		return;
	}

	/* read the debug level from the parent and update locally */
	ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
	if (ret !=0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
		return;
	}
	DEBUGLEVEL = debug_level;

	/* get relevant tunables */
	ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
		return;
	}

	/* get runstate */
	ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
				     CTDB_CURRENT_NODE, &ctdb->runstate);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
		return;
	}

	/* get the current recovery lock file from the server */
	if (update_recovery_lock_file(ctdb) != 0) {
		DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
		return;
	}

	pnn = ctdb_get_pnn(ctdb);

	/* get nodemap - freed and refetched every iteration so we
	 * always work from current cluster membership */
	TALLOC_FREE(rec->nodemap);
	ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
		return;
	}
	nodemap = rec->nodemap;

	/* remember our own node flags */
	rec->node_flags = nodemap->nodes[pnn].flags;

	ban_misbehaving_nodes(rec, &self_ban);
	if (self_ban) {
		DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
		return;
	}

	/* if the local daemon is STOPPED or BANNED, we verify that the databases are
	   also frozen and that the recmode is set to active.
	*/
	if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
		/* If this node has become inactive then we want to
		 * reduce the chances of it taking over the recovery
		 * master role when it becomes active again. This
		 * helps to stabilise the recovery master role so that
		 * it stays on the most stable node.
		 */
		rec->priority_time = timeval_current();

		ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
		}
		/* NOTE(review): recovery_mode is consulted even when the
		 * getrecmode call above failed - presumably the cached
		 * value from a previous iteration is acceptable; confirm. */
		if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
			DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));

			ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));

				return;
			}
		}
		/* freeze only once per inactive period;
		 * frozen_on_inactive is reset below when active again */
		if (! rec->frozen_on_inactive) {
			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
					       CTDB_CURRENT_NODE);
			if (ret != 0) {
				DEBUG(DEBUG_ERR,
				      (__location__ " Failed to freeze node "
				       "in STOPPED or BANNED state\n"));
				return;
			}

			rec->frozen_on_inactive = true;
		}

		/* If this node is stopped or banned then it is not the recovery
		 * master, so don't do anything. This prevents stopped or banned
		 * node from starting election and sending unnecessary controls.
		 */
		return;
	}

	rec->frozen_on_inactive = false;

	/* If we are not the recmaster then do some housekeeping */
	if (rec->recmaster != pnn) {
		/* Ignore any IP reallocate requests - only recmaster
		 * processes them
		 */
		TALLOC_FREE(rec->reallocate_requests);
		/* Clear any nodes that should be force rebalanced in
		 * the next takeover run.  If the recovery master role
		 * has moved then we don't want to process these some
		 * time in the future.
		 */
		TALLOC_FREE(rec->force_rebalance_nodes);
	}

	/* Retrieve capabilities from all connected nodes */
	ret = update_capabilities(rec, nodemap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
		return;
	}

	switch (validate_recovery_master(rec, mem_ctx)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}

	/* verify that we have all ip addresses we should have and we dont
	 * have addresses we shouldnt have.
	 */
	if (ctdb->tunable.disable_ip_failover == 0 &&
	    !ctdb_op_is_disabled(rec->takeover_run)) {
		if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
		}
	}


	/* if we are not the recmaster then we do not need to check
	   if recovery is needed
	 */
	if (pnn != rec->recmaster) {
		return;
	}

	/* ------------------------------------------------------------
	 * Everything below runs on the recovery master only.
	 * ------------------------------------------------------------ */

	/* ensure our local copies of flags are right */
	ret = update_local_flags(rec, nodemap);
	if (ret == MONITOR_ELECTION_NEEDED) {
		DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
		force_election(rec, pnn, nodemap);
		return;
	}
	if (ret != MONITOR_OK) {
		DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
		return;
	}

	if (ctdb->num_nodes != nodemap->num) {
		DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
		ctdb_load_nodes_file(ctdb);
		return;
	}

	/* verify that all active nodes agree that we are the recmaster */
	switch (verify_recmaster(rec, nodemap, pnn)) {
	case MONITOR_RECOVERY_NEEDED:
		/* can not happen */
		return;
	case MONITOR_ELECTION_NEEDED:
		force_election(rec, pnn, nodemap);
		return;
	case MONITOR_OK:
		break;
	case MONITOR_FAILED:
		return;
	}


	/* get the vnnmap */
	ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
		return;
	}

	if (rec->need_recovery) {
		/* a previous recovery didn't finish */
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes are in normal mode
	   and not in recovery mode
	 */
	switch (verify_recmode(ctdb, nodemap)) {
	case MONITOR_RECOVERY_NEEDED:
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	case MONITOR_FAILED:
		return;
	case MONITOR_ELECTION_NEEDED:
		/* can not happen */
		/* fallthrough */
	case MONITOR_OK:
		break;
	}


	if (ctdb->recovery_lock_file != NULL) {
		/* We must already hold the recovery lock */
		if (!ctdb_recovery_have_lock(ctdb)) {
			DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
			ctdb_set_culprit(rec, ctdb->pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* if there are takeovers requested, perform it and notify the waiters */
	if (!ctdb_op_is_disabled(rec->takeover_run) &&
	    rec->reallocate_requests) {
		process_ipreallocate_requests(ctdb, rec);
	}

	/* If recoveries are disabled then there is no use doing any
	 * nodemap or flags checks.  Recoveries might be disabled due
	 * to "reloadnodes", so doing these checks might cause an
	 * unnecessary recovery.  */
	if (ctdb_op_is_disabled(rec->recovery)) {
		return;
	}

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for(i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	*/
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);

			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0;i<nodemap->num;i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensure we have
	 * up-to-date information for all the nodes.  Each node is
	 * authoritative for its own flags (entry [j][j]).
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		*/
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				  nodemap->nodes[j].pnn,
				  nodemap->nodes[i].pnn,
				  remote_nodemaps[j]->nodes[i].flags,
				  nodemap->nodes[i].flags));
				/* A node's own view of its flags wins;
				 * otherwise the recmaster's view wins. */
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}


	/* count how many active nodes there are */
	num_lmasters = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}


	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability... or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
			  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		/* loop fell through without break: pnn absent from vnnmap */
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}


	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0;i<vnnmap->size;i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		/* If takeover run fails, then the offending nodes are
		 * assigned ban culprit counts. And we re-try takeover.
		 * If takeover run fails repeatedly, the node would get
		 * banned.
		 */
		do_takeover_run(rec, nodemap, true);
	}
}
|
---|
3898 |
|
---|
/*
 * The main monitoring loop of the recovery daemon.
 *
 * Allocates the long-lived ctdb_recoverd state, registers the SRVID
 * message handlers the daemon responds to, then runs main_loop()
 * forever, pacing iterations to roughly one per RecoverInterval.
 * Never returns; exits the process on fatal allocation failure.
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	/* rec holds all recovery-daemon state and is the talloc parent
	 * of the op trackers and handler registrations below */
	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;
	/* CTDB_UNKNOWN_PNN makes validate_recovery_master() force an
	 * election on the first main_loop iteration */
	rec->recmaster = CTDB_UNKNOWN_PNN;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();
	rec->frozen_on_inactive = false;

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes  */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemons node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node next
	   reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

	for (;;) {
		/* fresh temporary context per iteration so anything
		 * main_loop allocates on it is released each pass */
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every second
		 * (RecoverInterval tunable); sleep off the remainder */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}
|
---|
3996 |
|
---|
/*
  event handler for when the main ctdbd dies

  fde watches a descriptor tied to the parent daemon - presumably the
  pipe created in ctdb_start_recoverd(); becoming readable here means
  the parent has gone away (TODO confirm against the registration site).
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	/* _exit() rather than exit(): terminate immediately without
	 * running atexit handlers or flushing stdio */
	_exit(1);
}
|
---|
4007 |
|
---|
4008 | /*
|
---|
4009 | called regularly to verify that the recovery daemon is still running
|
---|
4010 | */
|
---|
4011 | static void ctdb_check_recd(struct tevent_context *ev,
|
---|
4012 | struct tevent_timer *te,
|
---|
4013 | struct timeval yt, void *p)
|
---|
4014 | {
|
---|
4015 | struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
|
---|
4016 |
|
---|
4017 | if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
|
---|
4018 | DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
|
---|
4019 |
|
---|
4020 | tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
|
---|
4021 | ctdb_restart_recd, ctdb);
|
---|
4022 |
|
---|
4023 | return;
|
---|
4024 | }
|
---|
4025 |
|
---|
4026 | tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
|
---|
4027 | timeval_current_ofs(30, 0),
|
---|
4028 | ctdb_check_recd, ctdb);
|
---|
4029 | }
|
---|
4030 |
|
---|
4031 | static void recd_sig_child_handler(struct tevent_context *ev,
|
---|
4032 | struct tevent_signal *se, int signum,
|
---|
4033 | int count, void *dont_care,
|
---|
4034 | void *private_data)
|
---|
4035 | {
|
---|
4036 | // struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
|
---|
4037 | int status;
|
---|
4038 | pid_t pid = -1;
|
---|
4039 |
|
---|
4040 | while (pid != 0) {
|
---|
4041 | pid = waitpid(-1, &status, WNOHANG);
|
---|
4042 | if (pid == -1) {
|
---|
4043 | if (errno != ECHILD) {
|
---|
4044 | DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
|
---|
4045 | }
|
---|
4046 | return;
|
---|
4047 | }
|
---|
4048 | if (pid > 0) {
|
---|
4049 | DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
|
---|
4050 | }
|
---|
4051 | }
|
---|
4052 | }
|
---|
4053 |
|
---|
4054 | /*
|
---|
4055 | startup the recovery daemon as a child of the main ctdb daemon
|
---|
4056 | */
|
---|
4057 | int ctdb_start_recoverd(struct ctdb_context *ctdb)
|
---|
4058 | {
|
---|
4059 | int fd[2];
|
---|
4060 | struct tevent_signal *se;
|
---|
4061 | struct tevent_fd *fde;
|
---|
4062 |
|
---|
4063 | if (pipe(fd) != 0) {
|
---|
4064 | return -1;
|
---|
4065 | }
|
---|
4066 |
|
---|
4067 | ctdb->recoverd_pid = ctdb_fork(ctdb);
|
---|
4068 | if (ctdb->recoverd_pid == -1) {
|
---|
4069 | return -1;
|
---|
4070 | }
|
---|
4071 |
|
---|
4072 | if (ctdb->recoverd_pid != 0) {
|
---|
4073 | talloc_free(ctdb->recd_ctx);
|
---|
4074 | ctdb->recd_ctx = talloc_new(ctdb);
|
---|
4075 | CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
|
---|
4076 |
|
---|
4077 | close(fd[0]);
|
---|
4078 | tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
|
---|
4079 | timeval_current_ofs(30, 0),
|
---|
4080 | ctdb_check_recd, ctdb);
|
---|
4081 | return 0;
|
---|
4082 | }
|
---|
4083 |
|
---|
4084 | close(fd[1]);
|
---|
4085 |
|
---|
4086 | srandom(getpid() ^ time(NULL));
|
---|
4087 |
|
---|
4088 | prctl_set_comment("ctdb_recovered");
|
---|
4089 | if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
|
---|
4090 | DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
|
---|
4091 | exit(1);
|
---|
4092 | }
|
---|
4093 |
|
---|
4094 | DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
|
---|
4095 |
|
---|
4096 | fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
|
---|
4097 | ctdb_recoverd_parent, &fd[0]);
|
---|
4098 | tevent_fd_set_auto_close(fde);
|
---|
4099 |
|
---|
4100 | /* set up a handler to pick up sigchld */
|
---|
4101 | se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
|
---|
4102 | recd_sig_child_handler, ctdb);
|
---|
4103 | if (se == NULL) {
|
---|
4104 | DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
|
---|
4105 | exit(1);
|
---|
4106 | }
|
---|
4107 |
|
---|
4108 | monitor_cluster(ctdb);
|
---|
4109 |
|
---|
4110 | DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
|
---|
4111 | return -1;
|
---|
4112 | }
|
---|
4113 |
|
---|
4114 | /*
|
---|
4115 | shutdown the recovery daemon
|
---|
4116 | */
|
---|
4117 | void ctdb_stop_recoverd(struct ctdb_context *ctdb)
|
---|
4118 | {
|
---|
4119 | if (ctdb->recoverd_pid == 0) {
|
---|
4120 | return;
|
---|
4121 | }
|
---|
4122 |
|
---|
4123 | DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
|
---|
4124 | ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
|
---|
4125 |
|
---|
4126 | TALLOC_FREE(ctdb->recd_ctx);
|
---|
4127 | TALLOC_FREE(ctdb->recd_ping_count);
|
---|
4128 | }
|
---|
4129 |
|
---|
4130 | static void ctdb_restart_recd(struct tevent_context *ev,
|
---|
4131 | struct tevent_timer *te,
|
---|
4132 | struct timeval t, void *private_data)
|
---|
4133 | {
|
---|
4134 | struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
|
---|
4135 |
|
---|
4136 | DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
|
---|
4137 | ctdb_stop_recoverd(ctdb);
|
---|
4138 | ctdb_start_recoverd(ctdb);
|
---|
4139 | }
|
---|