source: vendor/current/ctdb/server/ctdb_recoverd.c

/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "replace.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"

#include <popt.h>
#include <talloc.h>
#include <tevent.h>
#include <tdb.h>

#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
#include "lib/util/debug.h"
#include "lib/util/samba_util.h"
#include "lib/util/util_process.h"

#include "ctdb_private.h"
#include "ctdb_client.h"

#include "common/system.h"
#include "common/cmdline.h"
#include "common/common.h"
#include "common/logging.h"


/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct ctdb_srvid_message *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};

static void srvid_request_reply(struct ctdb_context *ctdb,
				struct ctdb_srvid_message *request,
				TDB_DATA result)
{
	/* Someone that sent srvid==0 does not want a reply */
	if (request->srvid == 0) {
		talloc_free(request);
		return;
	}

	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
				     result) == 0) {
		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
				  (unsigned)request->pnn,
				  (unsigned long long)request->srvid));
	} else {
		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
				 (unsigned)request->pnn,
				 (unsigned long long)request->srvid));
	}

	talloc_free(request);
}

static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *r;

	for (r = (*requests)->requests; r != NULL; r = r->next) {
		srvid_request_reply(ctdb, r->request, result);
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}

static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct ctdb_srvid_message *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list.  Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}

/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;
	bool in_progress;
	const char *name;
};

static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);

	if (state != NULL) {
		state->in_progress = false;
		state->name = name;
	}

	return state;
}

static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}

static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}

static bool ctdb_op_end(struct ctdb_op_state *state)
{
	return state->in_progress = false;
}

static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}

static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}

static void ctdb_op_timeout_handler(struct tevent_context *ev,
				    struct tevent_timer *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}

static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}

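/*
 * A minimal usage sketch of the ctdb_op abstraction above.  This helper
 * is illustrative only (the function name is hypothetical and nothing in
 * the daemon calls it): an operation cannot begin while a disable timer
 * is pending, and cannot be disabled while it is in progress.
 */
static int ctdb_op_usage_sketch(TALLOC_CTX *mem_ctx, struct tevent_context *ev)
{
	struct ctdb_op_state *op = ctdb_op_init(mem_ctx, "takeover runs");

	if (op == NULL) {
		return -ENOMEM;
	}

	/* Disable the operation for 60 seconds ... */
	if (ctdb_op_disable(op, ev, 60) != 0) {
		return -1;
	}

	/* ... so beginning it now fails and logs a notice ... */
	if (!ctdb_op_begin(op)) {
		/* ... until the timer fires, or it is re-enabled early */
		ctdb_op_enable(op);
	}

	if (ctdb_op_begin(op)) {
		/* ... perform the operation ... */
		ctdb_op_end(op);
	}

	return 0;
}
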
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};

/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);

/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_state bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}

enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};


/*
  remember the troublemaker
 */
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
	struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
	struct ctdb_banning_state *ban_state;

	/* culprit is an index into ctdb->nodes[], so num_nodes is already
	   out of range */
	if (culprit >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
		return;
	}

	/* If we are banned or stopped, do not set other nodes as culprits */
	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
		DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
		return;
	}

	if (ctdb->nodes[culprit]->ban_state == NULL) {
		ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
		CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
	}
	ban_state = ctdb->nodes[culprit]->ban_state;
	if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
		/* this was the first time in a long while this node
		   misbehaved so we will forgive any old transgressions.
		*/
		ban_state->count = 0;
	}

	ban_state->count += count;
	ban_state->last_reported_time = timeval_current();
	rec->last_culprit_node = culprit;
}

/*
  remember the troublemaker
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}

/* this callback is called for every node that failed to execute the
   recovered event
*/
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "recovered" eventscript on all nodes
 */
static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, tdb_null,
				      NULL, recovered_fail_callback,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));

		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/* this callback is called for every node that failed to execute the
   start recovery event
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "startrecovery" eventscript on all nodes
 */
static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, tdb_null,
				      NULL,
				      startrecovery_fail_callback,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/*
  Retrieve capabilities from all connected nodes
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map_old *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	ctdb->capabilities = *capp;

	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}

static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

/*
  change recovery mode on all nodes
 */
static int set_recovery_mode(struct ctdb_context *ctdb,
			     struct ctdb_recoverd *rec,
			     struct ctdb_node_map_old *nodemap,
			     uint32_t rec_mode, bool freeze)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* freeze all nodes */
	if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
		int i;

		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						      nodes, i,
						      CONTROL_TIMEOUT(),
						      false, tdb_null,
						      NULL,
						      set_recmode_fail_callback,
						      rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}

/* update all remote nodes to use the same db priority that we have.
   This can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map_old *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].db_id;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));

		ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						CTDB_CURRENT_NODE, &db_prio);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
					 db_prio.db_id));
		}
	}

	return 0;
}

/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map_old *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;

			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
						  dbmap->dbs[db].db_id, mem_ctx,
						  &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
						 nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}

/*
  ensure we are attached to any databases that anyone else is attached to
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map_old *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			const char *name;

			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
					break;
				}
			}
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
				continue;
			}
			/* ok so we need to create this database and
			   rebuild dbmap
			*/
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].db_id, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
						  nodemap->nodes[j].pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
				return -1;
			}
		}
	}

	return 0;
}

/*
  pull the remote database contents from one node into the recdb
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data_old *recdata;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = (struct ctdb_rec_data_old *)&reply->data[0];

	for (i=0;
	     i<reply->count;
	     recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		key.dptr = &recdata->data[0];
		key.dsize = recdata->keylen;
		data.dptr = &recdata->data[key.dsize];
		data.dsize = recdata->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
						  (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			free(existing.dptr);
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb_get_pnn(ctdb) &&
			       header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);

	return 0;
}

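/*
 * A sketch of the merge rule applied in the loop above, pulled out as a
 * hypothetical predicate for clarity (it is not used by the daemon): an
 * incoming copy of a record replaces the copy already stored in the recdb
 * when it carries a strictly higher RSN, or when the RSNs are equal and
 * the stored copy is not dmastered by this node.
 */
static bool recdb_incoming_copy_wins(const struct ctdb_ltdb_header *stored,
				     const struct ctdb_ltdb_header *incoming,
				     uint32_t our_pnn)
{
	if (stored->rsn < incoming->rsn) {
		return true;
	}
	if (stored->rsn == incoming->rsn && stored->dmaster != our_pnn) {
		return true;
	}
	return false;
}
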
struct pull_seqnum_cbdata {
	int failed;
	uint32_t pnn;
	uint64_t seqnum;
};

static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
	uint64_t seqnum;

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
		return;
	}

	if (res != 0) {
		DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
		cb_data->failed = 1;
		return;
	}

	if (outdata.dsize != sizeof(uint64_t)) {
		DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
		cb_data->failed = 1;
		return;
	}

	seqnum = *((uint64_t *)outdata.dptr);

	if (seqnum > cb_data->seqnum ||
	    (cb_data->pnn == -1 && seqnum == 0)) {
		cb_data->seqnum = seqnum;
		cb_data->pnn = node_pnn;
	}
}

static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
	cb_data->failed = 1;
}

static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
				   struct ctdb_recoverd *rec,
				   struct ctdb_node_map_old *nodemap,
				   struct tdb_wrap *recdb, uint32_t dbid)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	uint32_t *nodes;
	TDB_DATA data;
	uint32_t outdata[2];
	struct pull_seqnum_cbdata *cb_data;

	DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));

	outdata[0] = dbid;
	outdata[1] = 0;

	data.dsize = sizeof(outdata);
	data.dptr = (uint8_t *)&outdata[0];

	cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
	if (cb_data == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	cb_data->failed = 0;
	cb_data->pnn = -1;
	cb_data->seqnum = 0;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      pull_seqnum_cb,
				      pull_seqnum_fail_cb,
				      cb_data) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));

		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->pnn == -1) {
		DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));

	if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

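/*
 * A sketch of the candidate selection rule implemented by pull_seqnum_cb,
 * written as a hypothetical predicate (not called anywhere): the first
 * node to answer becomes the candidate even if it reports sequence
 * number 0; after that, only a strictly larger sequence number wins.
 */
static bool seqnum_candidate_is_better(const struct pull_seqnum_cbdata *best,
				       uint64_t seqnum)
{
	if (best->pnn == (uint32_t)-1 && seqnum == 0) {
		/* no candidate chosen yet */
		return true;
	}
	return seqnum > best->seqnum;
}
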
/*
  pull all the remote database contents into the recdb
 */
static int pull_remote_database(struct ctdb_context *ctdb,
				struct ctdb_recoverd *rec,
				struct ctdb_node_map_old *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid,
				bool persistent)
{
	int j;

	if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
		int ret;
		ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
		if (ret == 0) {
			return 0;
		}
	}

	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* don't merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
					 nodemap->nodes[j].pnn));
			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
			return -1;
		}
	}

	return 0;
}

/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}

/*
  ensure all nodes have the same vnnmap we do
 */
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* don't push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}
	}

	return 0;
}


/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}

/**
 * Process one element of the vacuum fetch list:
 * Migrate it over to us with the special flag
 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 */
static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
				     uint32_t pnn,
				     struct ctdb_rec_data_old *r)
{
	struct ctdb_client_call_state *state;
	TDB_DATA data;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_call call;

	ZERO_STRUCT(call);
	call.call_id = CTDB_NULL_FUNC;
	call.flags = CTDB_IMMEDIATE_MIGRATION;
	call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

	call.key.dptr = &r->data[0];
	call.key.dsize = r->keylen;

	/* ensure we don't block this daemon - just skip a record if we can't get
	   the chainlock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
		return true;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (hdr->dmaster == pnn) {
		/* it's already local */
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	free(data.dptr);

	state = ctdb_call_send(ctdb_db, &call);
	tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
	if (state == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
		return false;
	}
	state->async.fn = vacuum_fetch_callback;
	state->async.private_data = NULL;

	return true;
}

/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
				 void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map_old *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data_old *r;

	recs = (struct ctdb_marshall_buffer *)data.dptr;

	if (recs->count == 0) {
		goto done;
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		goto done;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].db_id == recs->db_id) {
			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		goto done;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		goto done;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		goto done;
	}

	r = (struct ctdb_rec_data_old *)&recs->data[0];
	while (recs->count) {
		bool ok;

		ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
		if (!ok) {
			break;
		}

		r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
		recs->count--;
	}

done:
	talloc_free(tmp_ctx);
}

/*
 * handler for database detach
 */
static void detach_database_handler(uint64_t srvid, TDB_DATA data,
				    void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t db_id;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(db_id)) {
		return;
	}
	db_id = *(uint32_t *)data.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		/* database is not attached */
		return;
	}

	DLIST_REMOVE(ctdb->db_list, ctdb_db);

	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
			     ctdb_db->db_name));
	talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}

/*
  wait for a given number of seconds
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	time_t usecs = (secs - (time_t)secs) * 1000000;
	tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
			 ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		tevent_loop_once(ctdb->ev);
	}
}

/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct tevent_context *ev,
				  struct tevent_timer *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
	fast_start = false;

	DEBUG(DEBUG_WARNING,("Election period ended\n"));
}


/*
  wait for an election to finish. It finishes election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	while (rec->election_timeout) {
		tevent_loop_once(ctdb->ev);
	}
}

/*
  Update our local flags from all remote connected nodes.
  This is only run when we are or we believe we are the recovery master
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
{
	int j;
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	 */
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map_old *remote_nodemap=NULL;
		int ret;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}
		if (nodemap->nodes[j].pnn == ctdb->pnn) {
			continue;
		}

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
					  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
				talloc_free(mem_ctx);
				return MONITOR_FAILED;
			}

			/* Update our local copy of the flags in the recovery
			   daemon.
			*/
			DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
					    nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
					    nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
		}
		talloc_free(remote_nodemap);
	}
	talloc_free(mem_ctx);
	return MONITOR_OK;
}

/* Create a new random generation id.
   The generation id cannot be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}


/*
  create a temporary working database
 */
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
{
	char *name;
	struct tdb_wrap *recdb;
	unsigned tdb_flags;

	/* open up the temporary recovery database */
	name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
			       ctdb->db_directory_state,
			       ctdb->pnn);
	if (name == NULL) {
		return NULL;
	}
	unlink(name);

	tdb_flags = TDB_NOLOCK;
	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}
	tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);

	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
	}

	talloc_free(name);

	return recdb;
}

/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;
	uint32_t len;
	uint32_t allocated_len;
	bool failed;
	bool persistent;
};

static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data_old *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database D has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will choose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}
	if (params->len + recdata->length >= params->allocated_len) {
		params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
		params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
	}
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
				  recdata->length + params->len));
		params->failed = true;
		return -1;
	}
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}

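/*
 * A sketch of the rule enforced at the top of traverse_recdb, written as
 * a hypothetical predicate (not used by the daemon): a record whose
 * payload is nothing but the ltdb header is a deletion tombstone.  It is
 * dropped for normal databases, but must be kept for persistent ones so
 * that its high RSN keeps protecting the deletion, as the scenario in
 * the comment above describes.
 */
static bool recdb_record_is_pushed(bool persistent, TDB_DATA data)
{
	if (persistent) {
		return true;
	}
	return data.dsize > sizeof(struct ctdb_ltdb_header);
}
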
/*
  push the recdb database out to all nodes
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.allocated_len = params.len;
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, outdata,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
			     dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);

	return 0;
}

/*
  go through a full recovery on one database
 */
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map_old *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_transdb w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.tid = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}

/* when we start a recovery, make sure all nodes use the same reclock file
   setting
*/
static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	TDB_DATA data;
	uint32_t *nodes;

	if (ctdb->recovery_lock_file == NULL) {
		data.dptr = NULL;
		data.dsize = 0;
	} else {
		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
		data.dptr = (uint8_t *)ctdb->recovery_lock_file;
	}

	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/*
 * this callback is called for every node that failed to execute
 * ctdb_takeover_run(); it sets a flag so the takeover run is retried.
 */
static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));

	if (callback_data != NULL) {
		struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));

		ctdb_set_culprit(rec, node_pnn);
	}
}


static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i;
	struct ctdb_banning_state *ban_state;

	*self_ban = false;
	for (i=0; i<ctdb->num_nodes; i++) {
		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}

		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
				    ctdb->nodes[i]->pnn, ban_state->count,
				    ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;

		/* Banning ourself? */
		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
			*self_ban = true;
		}
	}
}

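/*
 * Worked example of the banning arithmetic, assuming a 4-node cluster:
 * the ban threshold above is 2 * num_nodes = 8 credits.  A generic
 * failure recorded via ctdb_set_culprit() adds 1 credit, while a freeze
 * or transaction-start failure adds nodemap->num = 4 credits (see
 * set_recmode_fail_callback and transaction_start_fail_callback), so two
 * such heavyweight failures within recovery_grace_period reach the
 * threshold and the node is banned for recovery_ban_period seconds.
 * Credits also age out: if a node behaves for longer than
 * recovery_grace_period, ctdb_set_culprit_count() resets its count to 0.
 */
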
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map_old *nodemap,
			    bool banning_credits_on_fail)
{
	uint32_t *nodes = NULL;
	struct ctdb_disable_message dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress\n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when they think they
	 * should be hosting an IP but it isn't yet on an interface.
	 * Don't wait for replies since a failure here might cause some
	 * noise in the logs but will not actually cause a problem.
	 */
	ZERO_STRUCT(dtr);
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.timeout = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover_run(rec->ctdb, nodemap,
				rec->force_rebalance_nodes,
				takeover_fail_callback,
				banning_credits_on_fail ? rec : NULL);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.timeout = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}

struct recovery_helper_state {
	int fd[2];
	pid_t pid;
	int result;
	bool done;
};

static void ctdb_recovery_handler(struct tevent_context *ev,
				  struct tevent_fd *fde,
				  uint16_t flags, void *private_data)
{
	struct recovery_helper_state *state = talloc_get_type_abort(
		private_data, struct recovery_helper_state);
	int ret;

	ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
	if (ret != sizeof(state->result)) {
		state->result = EPIPE;
	}

	state->done = true;
}


static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
	static char prog[PATH_MAX+1] = "";
	const char **args;
	struct recovery_helper_state *state;
	struct tevent_fd *fde;
	int nargs, ret;

	if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
			     "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
			     "ctdb_recovery_helper")) {
		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
	}

	state = talloc_zero(mem_ctx, struct recovery_helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	state->pid = -1;
	/* initialise the fds so the fail path does not close stray
	   descriptors if pipe() fails */
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create pipe for recovery helper\n"));
		goto fail;
	}

	set_close_on_exec(state->fd[0]);

	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
	args[1] = rec->ctdb->daemon.name;
	args[2] = talloc_asprintf(args, "%u", new_generation());
	args[3] = NULL;

	if (args[0] == NULL || args[2] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);

	if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
				     args, NULL, NULL, &state->pid)) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create child for recovery helper\n"));
		goto fail;
	}

	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
			    TEVENT_FD_READ, ctdb_recovery_handler, state);
	if (fde == NULL) {
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);
	}

	close(state->fd[0]);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}

1835static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1836 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1837 struct ctdb_vnn_map *vnnmap,
1838 struct ctdb_dbid_map_old *dbmap)
1839{
1840 struct ctdb_context *ctdb = rec->ctdb;
1841 uint32_t generation;
1842 TDB_DATA data;
1843 uint32_t *nodes;
1844 int ret, i, j;
1845
1846 /* set recovery mode to active on all nodes */
1847 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1848 if (ret != 0) {
1849 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1850 return -1;
1851 }
1852
1853 /* execute the "startrecovery" event script on all nodes */
1854 ret = run_startrecovery_eventscript(rec, nodemap);
1855 if (ret!=0) {
1856 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1857 return -1;
1858 }
1859
1860 /* pick a new generation number */
1861 generation = new_generation();
1862
1863 /* change the vnnmap on this node to use the new generation
1864 number but not on any other nodes.
1865 this guarantees that if we abort the recovery prematurely
1866 for some reason (a node stops responding?)
1867 that we can just return immediately and we will reenter
1868 recovery shortly again.
1869 I.e. we deliberately leave the cluster with an inconsistent
1870 generation id to allow us to abort recovery at any stage and
1871 just restart it from scratch.
1872 */
1873 vnnmap->generation = generation;
1874 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1875 if (ret != 0) {
1876 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1877 return -1;
1878 }
1879
1880 /* Database generations are updated when the transaction is commited to
1881 * the databases. So make sure to use the final generation as the
1882 * transaction id
1883 */
1884 generation = new_generation();
1885
1886 data.dptr = (void *)&generation;
1887 data.dsize = sizeof(uint32_t);
1888
1889 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1890 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1891 nodes, 0,
1892 CONTROL_TIMEOUT(), false, data,
1893 NULL,
1894 transaction_start_fail_callback,
1895 rec) != 0) {
1896 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1897 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1898 nodes, 0,
1899 CONTROL_TIMEOUT(), false, tdb_null,
1900 NULL,
1901 NULL,
1902 NULL) != 0) {
1903 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1904 }
1905 return -1;
1906 }
1907
1908 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1909
1910 for (i=0;i<dbmap->num;i++) {
1911 ret = recover_database(rec, mem_ctx,
1912 dbmap->dbs[i].db_id,
1913 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1914 pnn, nodemap, generation);
1915 if (ret != 0) {
1916 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1917 return -1;
1918 }
1919 }
1920
1921 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1922
1923 /* commit all the changes */
1924 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1925 nodes, 0,
1926 CONTROL_TIMEOUT(), false, data,
1927 NULL, NULL,
1928 NULL) != 0) {
1929 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1930 return -1;
1931 }
1932
1933 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1934
1935 /* build a new vnn map with all the currently active and
1936 unbanned nodes */
1937 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1938 CTDB_NO_MEMORY(ctdb, vnnmap);
1939 vnnmap->generation = generation;
1940 vnnmap->size = 0;
1941 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1942 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1943 for (i=j=0;i<nodemap->num;i++) {
1944 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1945 continue;
1946 }
1947 if (!ctdb_node_has_capabilities(rec->caps,
1948 ctdb->nodes[i]->pnn,
1949 CTDB_CAP_LMASTER)) {
1950			/* this node cannot be an lmaster */
1951			DEBUG(DEBUG_DEBUG, ("Node %d can't be an lmaster, skipping it\n", i));
1952 continue;
1953 }
1954
1955 vnnmap->size++;
1956 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1957 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1958 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1959
1960 }
1961 if (vnnmap->size == 0) {
1962 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1963 vnnmap->size++;
1964 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1965 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1966 vnnmap->map[0] = pnn;
1967 }
1968
1969 /* update to the new vnnmap on all nodes */
1970 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1971 if (ret != 0) {
1972 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1973 return -1;
1974 }
1975
1976 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1977
1978 /* disable recovery mode */
1979 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
1980 if (ret != 0) {
1981 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1982 return -1;
1983 }
1984
1985 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1986
1987 /* execute the "recovered" event script on all nodes */
1988 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1989 if (ret!=0) {
1990 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1991 return -1;
1992 }
1993
1994 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1995
1996 return 0;
1997}
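
/* Illustrative sketch, not part of the original file: db_recovery_serial()
 * above picks one generation for the interim vnnmap and then a second,
 * final generation to use as the transaction id.  A generation id must
 * never collide with the reserved "invalid" marker; one plausible way to
 * pick one is shown below.  The constant and function names here are
 * hypothetical stand-ins, not the real helpers used by this file. */
#if 0
#include <stdint.h>
#include <stdlib.h>

#define EXAMPLE_INVALID_GENERATION 1

static uint32_t example_new_generation(void)
{
	uint32_t generation;

	/* retry until the value differs from the reserved marker */
	do {
		generation = (uint32_t)random();
	} while (generation == EXAMPLE_INVALID_GENERATION);

	return generation;
}
#endif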
1998
1999/*
2000 we are the recmaster, and recovery is needed - start a recovery run
2001 */
2002static int do_recovery(struct ctdb_recoverd *rec,
2003 TALLOC_CTX *mem_ctx, uint32_t pnn,
2004 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2005{
2006 struct ctdb_context *ctdb = rec->ctdb;
2007 int i, ret;
2008 struct ctdb_dbid_map_old *dbmap;
2009 struct timeval start_time;
2010 bool self_ban;
2011 bool par_recovery;
2012
2013 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2014
2015 /* Check if the current node is still the recmaster. It's possible that
2016 * re-election has changed the recmaster.
2017 */
2018 if (pnn != rec->recmaster) {
2019 DEBUG(DEBUG_NOTICE,
2020 ("Recovery master changed to %u, aborting recovery\n",
2021 rec->recmaster));
2022 return -1;
2023 }
2024
2025 /* if recovery fails, force it again */
2026 rec->need_recovery = true;
2027
2028 if (!ctdb_op_begin(rec->recovery)) {
2029 return -1;
2030 }
2031
2032 if (rec->election_timeout) {
2033 /* an election is in progress */
2034 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2035 goto fail;
2036 }
2037
2038 ban_misbehaving_nodes(rec, &self_ban);
2039 if (self_ban) {
2040 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
2041 goto fail;
2042 }
2043
2044 if (ctdb->recovery_lock_file != NULL) {
2045 if (ctdb_recovery_have_lock(ctdb)) {
2046 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2047 } else {
2048 start_time = timeval_current();
2049 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2050 ctdb->recovery_lock_file));
2051 if (!ctdb_recovery_lock(ctdb)) {
2052 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2053 /* If ctdb is trying first recovery, it's
2054					/* If ctdb is attempting its first recovery,
2055					 * it's possible that the current node does
2056					 * not yet know who the recmaster is.
2057 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2058 " - retrying recovery\n"));
2059 goto fail;
2060 }
2061
2062 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2063				     "and banning ourselves for %u seconds\n",
2064 ctdb->tunable.recovery_ban_period));
2065 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2066 goto fail;
2067 }
2068 ctdb_ctrl_report_recd_lock_latency(ctdb,
2069 CONTROL_TIMEOUT(),
2070 timeval_elapsed(&start_time));
2071 DEBUG(DEBUG_NOTICE,
2072 ("Recovery lock taken successfully by recovery daemon\n"));
2073 }
2074 }
2075
2076 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2077
2078 /* get a list of all databases */
2079 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2080 if (ret != 0) {
2081		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
2082 goto fail;
2083 }
2084
2085 /* we do the db creation before we set the recovery mode, so the freeze happens
2086 on all databases we will be dealing with. */
2087
2088 /* verify that we have all the databases any other node has */
2089 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2090 if (ret != 0) {
2091 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2092 goto fail;
2093 }
2094
2095 /* verify that all other nodes have all our databases */
2096 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2097 if (ret != 0) {
2098 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2099 goto fail;
2100 }
2101 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2102
2103 /* update the database priority for all remote databases */
2104 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2105 if (ret != 0) {
2106 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2107 }
2108 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2109
2110
2111 /* update all other nodes to use the same setting for reclock files
2112 as the local recovery master.
2113 */
2114 sync_recovery_lock_file_across_cluster(rec);
2115
2116 /* Retrieve capabilities from all connected nodes */
2117 ret = update_capabilities(rec, nodemap);
2118 if (ret!=0) {
2119 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2120		goto fail;
2121 }
2122
2123 /*
2124 update all nodes to have the same flags that we have
2125 */
2126 for (i=0;i<nodemap->num;i++) {
2127 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2128 continue;
2129 }
2130
2131 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2132 if (ret != 0) {
2133 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2134				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
2135			} else {
2136				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2137				goto fail;
2138 }
2139 }
2140 }
2141
2142 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2143
2144 /* Check if all participating nodes have parallel recovery capability */
2145 par_recovery = true;
2146 for (i=0; i<nodemap->num; i++) {
2147 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2148 continue;
2149 }
2150
2151 if (!(rec->caps[i].capabilities &
2152 CTDB_CAP_PARALLEL_RECOVERY)) {
2153 par_recovery = false;
2154 break;
2155 }
2156 }
2157
2158 if (par_recovery) {
2159 ret = db_recovery_parallel(rec, mem_ctx);
2160 } else {
2161 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2162 dbmap);
2163 }
2164
2165 if (ret != 0) {
2166 goto fail;
2167 }
2168
2169 do_takeover_run(rec, nodemap, false);
2170
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2174 CTDB_SRVID_RECONFIGURE, tdb_null);
2175 if (ret != 0) {
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2177 goto fail;
2178 }
2179
2180 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2181
2182 rec->need_recovery = false;
2183 ctdb_op_end(rec->recovery);
2184
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2187 recovery.
2188 */
2189 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2190 for (i=0;i<nodemap->num;i++) {
2191 struct ctdb_banning_state *ban_state;
2192
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2194 continue;
2195 }
2196
2197 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2198 if (ban_state == NULL) {
2199 continue;
2200 }
2201
2202 ban_state->count = 0;
2203 }
2204
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2208 */
2209	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2210 ctdb_op_disable(rec->recovery, ctdb->ev,
2211 ctdb->tunable.rerecovery_timeout);
2212 return 0;
2213
2214fail:
2215 ctdb_op_end(rec->recovery);
2216 return -1;
2217}
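
/* Illustrative sketch, not part of the original file: the ctdb_op_begin()
 * call at the top of do_recovery() must be balanced by ctdb_op_end() on
 * every exit path, which is why error paths jump to the fail: label.
 * The helper and example_do_work() below are hypothetical and only
 * demonstrate the begin/fail/end pattern. */
#if 0
static int example_op_scope(struct ctdb_op_state *op)
{
	if (!ctdb_op_begin(op)) {
		return -1;
	}

	if (example_do_work() != 0) {
		/* jump rather than return, to keep begin/end balanced */
		goto fail;
	}

	ctdb_op_end(op);
	return 0;

fail:
	ctdb_op_end(op);
	return -1;
}
#endif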
2218
2219
2220/*
2221 elections are won by first checking the number of connected nodes, then
2222 the priority time, then the pnn
2223 */
2224struct election_message {
2225 uint32_t num_connected;
2226 struct timeval priority_time;
2227 uint32_t pnn;
2228 uint32_t node_flags;
2229};
2230
2231/*
2232  form this node's election data
2233 */
2234static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2235{
2236 int ret, i;
2237 struct ctdb_node_map_old *nodemap;
2238 struct ctdb_context *ctdb = rec->ctdb;
2239
2240 ZERO_STRUCTP(em);
2241
2242 em->pnn = rec->ctdb->pnn;
2243 em->priority_time = rec->priority_time;
2244
2245 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2246 if (ret != 0) {
2247 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2248 return;
2249 }
2250
2251 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2252 em->node_flags = rec->node_flags;
2253
2254 for (i=0;i<nodemap->num;i++) {
2255 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2256 em->num_connected++;
2257 }
2258 }
2259
2260	/* we shouldn't try to win this election if we can't be a recmaster */
2261 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2262 em->num_connected = 0;
2263 em->priority_time = timeval_current();
2264 }
2265
2266 talloc_free(nodemap);
2267}
2268
2269/*
2270 see if the given election data wins
2271 */
2272static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2273{
2274 struct election_message myem;
2275 int cmp = 0;
2276
2277 ctdb_election_data(rec, &myem);
2278
2279	/* we can't win if we don't have the recmaster capability */
2280 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2281 return false;
2282 }
2283
2284	/* we can't win if we are banned */
2285 if (rec->node_flags & NODE_FLAGS_BANNED) {
2286 return false;
2287 }
2288
2289	/* we can't win if we are stopped */
2290 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2291 return false;
2292 }
2293
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_BANNED) {
2296 return true;
2297 }
2298
2299	/* we will automatically win if the other node is stopped */
2300 if (em->node_flags & NODE_FLAGS_STOPPED) {
2301 return true;
2302 }
2303
	/* try to use the most connected node, as documented by the
	 * header comment above: num_connected is compared first */
	if (cmp == 0) {
		cmp = (int)myem.num_connected - (int)em->num_connected;
	}

2304	/* then the longest running node */
2305 if (cmp == 0) {
2306 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2307 }
2308
2309 if (cmp == 0) {
2310 cmp = (int)myem.pnn - (int)em->pnn;
2311 }
2312
2313 return cmp > 0;
2314}
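
/* Illustrative sketch, not part of the original file: with the comparison
 * chain above, connectedness is decisive before uptime or PNN.  The two
 * messages below use made-up values. */
#if 0
static void example_election_order(void)
{
	struct election_message a = { .num_connected = 3, .pnn = 2 };
	struct election_message b = { .num_connected = 2, .pnn = 0 };

	/* Seen from node a, cmp = 3 - 2 = 1 > 0, so a wins the election
	 * outright; priority_time and pnn are never consulted. */
	(void)a;
	(void)b;
}
#endif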
2315
2316/*
2317 send out an election request
2318 */
2319static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2320{
2321 int ret;
2322 TDB_DATA election_data;
2323 struct election_message emsg;
2324 uint64_t srvid;
2325 struct ctdb_context *ctdb = rec->ctdb;
2326
2327 srvid = CTDB_SRVID_ELECTION;
2328
2329 ctdb_election_data(rec, &emsg);
2330
2331 election_data.dsize = sizeof(struct election_message);
2332 election_data.dptr = (unsigned char *)&emsg;
2333
2334
2335 /* first we assume we will win the election and set
2336 recoverymaster to be ourself on the current node
2337 */
2338 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2339 CTDB_CURRENT_NODE, pnn);
2340 if (ret != 0) {
2341 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2342 return -1;
2343 }
2344 rec->recmaster = pnn;
2345
2346 /* send an election message to all active nodes */
2347 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2348 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2349}
2350
2351/*
2352 we think we are winning the election - send a broadcast election request
2353 */
2354static void election_send_request(struct tevent_context *ev,
2355 struct tevent_timer *te,
2356 struct timeval t, void *p)
2357{
2358 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2359 int ret;
2360
2361 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2362 if (ret != 0) {
2363 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2364 }
2365
2366 TALLOC_FREE(rec->send_election_te);
2367}
2368
2369/*
2370 handler for memory dumps
2371*/
2372static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2373{
2374 struct ctdb_recoverd *rec = talloc_get_type(
2375 private_data, struct ctdb_recoverd);
2376 struct ctdb_context *ctdb = rec->ctdb;
2377 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2378 TDB_DATA *dump;
2379 int ret;
2380 struct ctdb_srvid_message *rd;
2381
2382 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2383 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2384 talloc_free(tmp_ctx);
2385 return;
2386 }
2387 rd = (struct ctdb_srvid_message *)data.dptr;
2388
2389 dump = talloc_zero(tmp_ctx, TDB_DATA);
2390 if (dump == NULL) {
2391 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2392 talloc_free(tmp_ctx);
2393 return;
2394 }
2395 ret = ctdb_dump_memory(ctdb, dump);
2396 if (ret != 0) {
2397 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2398 talloc_free(tmp_ctx);
2399 return;
2400 }
2401
2402	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2403
2404 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2405 if (ret != 0) {
2406 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2407 talloc_free(tmp_ctx);
2408 return;
2409 }
2410
2411 talloc_free(tmp_ctx);
2412}
2413
2414/*
2415 handler for reload_nodes
2416*/
2417static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2418 void *private_data)
2419{
2420 struct ctdb_recoverd *rec = talloc_get_type(
2421 private_data, struct ctdb_recoverd);
2422
2423 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2424
2425 ctdb_load_nodes_file(rec->ctdb);
2426}
2427
2428
2429static void ctdb_rebalance_timeout(struct tevent_context *ev,
2430 struct tevent_timer *te,
2431 struct timeval t, void *p)
2432{
2433 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2434
2435 if (rec->force_rebalance_nodes == NULL) {
2436 DEBUG(DEBUG_ERR,
2437 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2438 return;
2439 }
2440
2441 DEBUG(DEBUG_NOTICE,
2442 ("Rebalance timeout occurred - trigger takeover run\n"));
2443 rec->need_takeover_run = true;
2444}
2445
2446
2447static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2448 void *private_data)
2449{
2450 struct ctdb_recoverd *rec = talloc_get_type(
2451 private_data, struct ctdb_recoverd);
2452 struct ctdb_context *ctdb = rec->ctdb;
2453 uint32_t pnn;
2454 uint32_t *t;
2455 int len;
2456 uint32_t deferred_rebalance;
2457
2458 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2459 return;
2460 }
2461
2462 if (data.dsize != sizeof(uint32_t)) {
2463 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2464 return;
2465 }
2466
2467 pnn = *(uint32_t *)&data.dptr[0];
2468
2469 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2470
2471	/* Copy any existing list of nodes.  A realloc variant could
2472	 * probably do this, but freeing the old array must also cancel
2473	 * the timer event for the timeout, and it is not clear that
2474	 * realloc would preserve that behaviour.
2475	 */
2476 len = (rec->force_rebalance_nodes != NULL) ?
2477 talloc_array_length(rec->force_rebalance_nodes) :
2478 0;
2479
2480 /* This allows duplicates to be added but they don't cause
2481 * harm. A call to add a duplicate PNN arguably means that
2482 * the timeout should be reset, so this is the simplest
2483 * solution.
2484 */
2485 t = talloc_zero_array(rec, uint32_t, len+1);
2486 CTDB_NO_MEMORY_VOID(ctdb, t);
2487 if (len > 0) {
2488 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2489 }
2490 t[len] = pnn;
2491
2492 talloc_free(rec->force_rebalance_nodes);
2493
2494 rec->force_rebalance_nodes = t;
2495
2496	/* If configured, set up a deferred takeover run to make sure
2497 * that certain nodes get IPs rebalanced to them. This will
2498 * be cancelled if a successful takeover run happens before
2499 * the timeout. Assign tunable value to variable for
2500 * readability.
2501 */
2502 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2503 if (deferred_rebalance != 0) {
2504 tevent_add_timer(ctdb->ev, rec->force_rebalance_nodes,
2505 timeval_current_ofs(deferred_rebalance, 0),
2506 ctdb_rebalance_timeout, rec);
2507 }
2508}
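
/* Illustrative sketch, not part of the original file: the rebalance timer
 * above is allocated off rec->force_rebalance_nodes, so a talloc_free()
 * of that array (done after a successful takeover run) also destroys the
 * pending timer.  A minimal demonstration of the same talloc/tevent
 * idiom, with hypothetical names: */
#if 0
static void example_timeout_cb(struct tevent_context *ev,
			       struct tevent_timer *te,
			       struct timeval t, void *p)
{
	/* never fires if the owning context is freed first */
}

static void example_parented_timer(struct tevent_context *ev)
{
	TALLOC_CTX *owner = talloc_new(NULL);

	/* the timer becomes a talloc child of owner ... */
	tevent_add_timer(ev, owner, timeval_current_ofs(10, 0),
			 example_timeout_cb, NULL);

	/* ... so freeing owner cancels the timeout as a side effect */
	talloc_free(owner);
}
#endif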
2509
2510
2511
2512static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
2513 void *private_data)
2514{
2515 struct ctdb_recoverd *rec = talloc_get_type(
2516 private_data, struct ctdb_recoverd);
2517 struct ctdb_public_ip *ip;
2518
2519 if (rec->recmaster != rec->ctdb->pnn) {
2520 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2521 return;
2522 }
2523
2524 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2525 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2526 return;
2527 }
2528
2529 ip = (struct ctdb_public_ip *)data.dptr;
2530
2531 update_ip_assignment_tree(rec->ctdb, ip);
2532}
2533
2534static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2535 TDB_DATA data,
2536 struct ctdb_op_state *op_state)
2537{
2538 struct ctdb_disable_message *r;
2539 uint32_t timeout;
2540 TDB_DATA result;
2541 int32_t ret = 0;
2542
2543 /* Validate input data */
2544 if (data.dsize != sizeof(struct ctdb_disable_message)) {
2545		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data %lu, "
2546			 "expecting %lu\n", (long unsigned)data.dsize,
2547			 (long unsigned)sizeof(struct ctdb_disable_message)));
2548 return;
2549 }
2550 if (data.dptr == NULL) {
2551 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2552 return;
2553 }
2554
2555 r = (struct ctdb_disable_message *)data.dptr;
2556 timeout = r->timeout;
2557
2558 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2559 if (ret != 0) {
2560 goto done;
2561 }
2562
2563 /* Returning our PNN tells the caller that we succeeded */
2564 ret = ctdb_get_pnn(ctdb);
2565done:
2566 result.dsize = sizeof(int32_t);
2567 result.dptr = (uint8_t *)&ret;
2568 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
2569}
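
/* Illustrative sketch, not part of the original file: the reply sent
 * above is a single int32_t, where a value >= 0 is the PNN of the node
 * that performed the disable and -1 signals failure.  A hypothetical
 * decoder on the receiving side (assumes the usual system headers
 * already pulled in by this file): */
#if 0
static bool example_decode_disable_reply(TDB_DATA result, uint32_t *pnn)
{
	int32_t ret;

	if (result.dsize != sizeof(int32_t) || result.dptr == NULL) {
		return false;
	}
	memcpy(&ret, result.dptr, sizeof(ret));
	if (ret < 0) {
		return false;
	}
	*pnn = (uint32_t)ret;
	return true;
}
#endif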
2570
2571static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2572 void *private_data)
2573{
2574 struct ctdb_recoverd *rec = talloc_get_type(
2575 private_data, struct ctdb_recoverd);
2576
2577 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2578}
2579
2580/* Backward compatibility for this SRVID */
2581static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2582 void *private_data)
2583{
2584 struct ctdb_recoverd *rec = talloc_get_type(
2585 private_data, struct ctdb_recoverd);
2586 uint32_t timeout;
2587
2588 if (data.dsize != sizeof(uint32_t)) {
2589 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2590 "expecting %lu\n", (long unsigned)data.dsize,
2591 (long unsigned)sizeof(uint32_t)));
2592 return;
2593 }
2594 if (data.dptr == NULL) {
2595 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2596 return;
2597 }
2598
2599 timeout = *((uint32_t *)data.dptr);
2600
2601 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
2602}
2603
2604static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2605 void *private_data)
2606{
2607 struct ctdb_recoverd *rec = talloc_get_type(
2608 private_data, struct ctdb_recoverd);
2609
2610 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2611}
2612
2613/*
2614  handler for ip reallocate: just add the request to the list and
2615  handle it later in the monitor_cluster loop, so we do not recurse
2616  into takeover_run() from other requests
2617*/
2618static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2619 void *private_data)
2620{
2621 struct ctdb_srvid_message *request;
2622 struct ctdb_recoverd *rec = talloc_get_type(
2623 private_data, struct ctdb_recoverd);
2624
2625 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2626 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2627 return;
2628 }
2629
2630 request = (struct ctdb_srvid_message *)data.dptr;
2631
2632 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2633}
2634
2635static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2636 struct ctdb_recoverd *rec)
2637{
2638 TDB_DATA result;
2639 int32_t ret;
2640 struct srvid_requests *current;
2641
2642 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2643
2644 /* Only process requests that are currently pending. More
2645 * might come in while the takeover run is in progress and
2646 * they will need to be processed later since they might
2647	 * be in response to flag changes.
2648 */
2649 current = rec->reallocate_requests;
2650 rec->reallocate_requests = NULL;
2651
2652 if (do_takeover_run(rec, rec->nodemap, false)) {
2653 ret = ctdb_get_pnn(ctdb);
2654 } else {
2655 ret = -1;
2656 }
2657
2658 result.dsize = sizeof(int32_t);
2659 result.dptr = (uint8_t *)&ret;
2660
2661 srvid_requests_reply(ctdb, &current, result);
2662}
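
/* Illustrative sketch, not part of the original file: the swap above
 * (grab the current list, reset the shared pointer, reply only to the
 * grabbed batch) is what keeps requests arriving mid-run queued for the
 * next pass.  A generic version of the same move, hypothetical name: */
#if 0
static struct srvid_requests *example_take_pending(
	struct srvid_requests **shared)
{
	struct srvid_requests *batch = *shared;

	/* new arrivals now queue onto a fresh (NULL) list */
	*shared = NULL;

	return batch;
}
#endif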
2663
2664/*
2665 * handler for assigning banning credits
2666 */
2667static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2668{
2669 struct ctdb_recoverd *rec = talloc_get_type(
2670 private_data, struct ctdb_recoverd);
2671 uint32_t ban_pnn;
2672
2673 /* Ignore if we are not recmaster */
2674 if (rec->ctdb->pnn != rec->recmaster) {
2675 return;
2676 }
2677
2678 if (data.dsize != sizeof(uint32_t)) {
2679		DEBUG(DEBUG_ERR, (__location__ " invalid data size %zu\n",
2680 data.dsize));
2681 return;
2682 }
2683
2684 ban_pnn = *(uint32_t *)data.dptr;
2685
2686 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
2687}
2688
2689/*
2690 handler for recovery master elections
2691*/
2692static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2693{
2694 struct ctdb_recoverd *rec = talloc_get_type(
2695 private_data, struct ctdb_recoverd);
2696 struct ctdb_context *ctdb = rec->ctdb;
2697 int ret;
2698 struct election_message *em = (struct election_message *)data.dptr;
2699
2700 /* Ignore election packets from ourself */
2701 if (ctdb->pnn == em->pnn) {
2702 return;
2703 }
2704
2705 /* we got an election packet - update the timeout for the election */
2706 talloc_free(rec->election_timeout);
2707 rec->election_timeout = tevent_add_timer(
2708 ctdb->ev, ctdb,
2709 fast_start ?
2710 timeval_current_ofs(0, 500000) :
2711 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2712 ctdb_election_timeout, rec);
2713
2714 /* someone called an election. check their election data
2715 and if we disagree and we would rather be the elected node,
2716 send a new election message to all other nodes
2717 */
2718 if (ctdb_election_win(rec, em)) {
2719 if (!rec->send_election_te) {
2720 rec->send_election_te = tevent_add_timer(
2721 ctdb->ev, rec,
2722 timeval_current_ofs(0, 500000),
2723 election_send_request, rec);
2724 }
2725 return;
2726 }
2727
2728 /* we didn't win */
2729 TALLOC_FREE(rec->send_election_te);
2730
2731 /* Release the recovery lock file */
2732 if (ctdb_recovery_have_lock(ctdb)) {
2733 ctdb_recovery_unlock(ctdb);
2734 }
2735
2736 clear_ip_assignment_tree(ctdb);
2737
2738	/* ok, let that node become recmaster then */
2739 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2740 CTDB_CURRENT_NODE, em->pnn);
2741 if (ret != 0) {
2742		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2743 return;
2744 }
2745 rec->recmaster = em->pnn;
2746
2747 return;
2748}
2749
2750
2751/*
2752 force the start of the election process
2753 */
2754static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2755 struct ctdb_node_map_old *nodemap)
2756{
2757 int ret;
2758 struct ctdb_context *ctdb = rec->ctdb;
2759
2760 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2761
2762 /* set all nodes to recovery mode to stop all internode traffic */
2763 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2764 if (ret != 0) {
2765 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2766 return;
2767 }
2768
2769 talloc_free(rec->election_timeout);
2770 rec->election_timeout = tevent_add_timer(
2771 ctdb->ev, ctdb,
2772 fast_start ?
2773 timeval_current_ofs(0, 500000) :
2774 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2775 ctdb_election_timeout, rec);
2776
2777 ret = send_election_request(rec, pnn);
2778 if (ret!=0) {
2779		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
2780 return;
2781 }
2782
2783 /* wait for a few seconds to collect all responses */
2784 ctdb_wait_election(rec);
2785}
2786
2787
2788
2789/*
2790 handler for when a node changes its flags
2791*/
2792static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2793{
2794 struct ctdb_recoverd *rec = talloc_get_type(
2795 private_data, struct ctdb_recoverd);
2796 struct ctdb_context *ctdb = rec->ctdb;
2797 int ret;
2798 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2799 struct ctdb_node_map_old *nodemap=NULL;
2800 TALLOC_CTX *tmp_ctx;
2801 int i;
2802 int disabled_flag_changed;
2803
2804 if (data.dsize != sizeof(*c)) {
2805		DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change\n"));
2806 return;
2807 }
2808
2809 tmp_ctx = talloc_new(ctdb);
2810 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2811
2812 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2813 if (ret != 0) {
2814		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2815 talloc_free(tmp_ctx);
2816 return;
2817 }
2818
2819
2820 for (i=0;i<nodemap->num;i++) {
2821 if (nodemap->nodes[i].pnn == c->pnn) break;
2822 }
2823
2824 if (i == nodemap->num) {
2825		DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2826 talloc_free(tmp_ctx);
2827 return;
2828 }
2829
2830 if (c->old_flags != c->new_flags) {
2831 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2832 }
2833
2834 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2835
2836 nodemap->nodes[i].flags = c->new_flags;
2837
2838 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2839 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2840
2841 if (ret == 0 &&
2842 rec->recmaster == ctdb->pnn &&
2843 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2844		/* Only do the takeover run if the perm disabled or unhealthy
2845		   flags changed, since these cause an IP failover but not
2846		   a recovery.
2847		   If the node became disconnected or banned this will also
2848		   lead to an IP address failover, but that is handled
2849		   during recovery.
2850		*/
2851 if (disabled_flag_changed) {
2852 rec->need_takeover_run = true;
2853 }
2854 }
2855
2856 talloc_free(tmp_ctx);
2857}
2858
2859/*
2860  handler for when we need to push out flag changes to all other nodes
2861*/
2862static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2863 void *private_data)
2864{
2865 struct ctdb_recoverd *rec = talloc_get_type(
2866 private_data, struct ctdb_recoverd);
2867 struct ctdb_context *ctdb = rec->ctdb;
2868 int ret;
2869 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2870 struct ctdb_node_map_old *nodemap=NULL;
2871 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2872 uint32_t *nodes;
2873
2874 /* read the node flags from the recmaster */
2875 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2876 tmp_ctx, &nodemap);
2877 if (ret != 0) {
2878		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recmaster %u\n", rec->recmaster));
2879 talloc_free(tmp_ctx);
2880 return;
2881 }
2882 if (c->pnn >= nodemap->num) {
2883 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2884 talloc_free(tmp_ctx);
2885 return;
2886 }
2887
2888 /* send the flags update to all connected nodes */
2889 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2890
2891 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2892 nodes, 0, CONTROL_TIMEOUT(),
2893 false, data,
2894 NULL, NULL,
2895 NULL) != 0) {
2896 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2897
2898 talloc_free(tmp_ctx);
2899 return;
2900 }
2901
2902 talloc_free(tmp_ctx);
2903}
2904
2905
2906struct verify_recmode_normal_data {
2907 uint32_t count;
2908 enum monitor_result status;
2909};
2910
2911static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2912{
2913 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2914
2915
2916	/* one more node has responded with recmode data */
2917 rmdata->count--;
2918
2919 /* if we failed to get the recmode, then return an error and let
2920 the main loop try again.
2921 */
2922 if (state->state != CTDB_CONTROL_DONE) {
2923 if (rmdata->status == MONITOR_OK) {
2924 rmdata->status = MONITOR_FAILED;
2925 }
2926 return;
2927 }
2928
2929 /* if we got a response, then the recmode will be stored in the
2930 status field
2931 */
2932 if (state->status != CTDB_RECOVERY_NORMAL) {
2933 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2934 rmdata->status = MONITOR_RECOVERY_NEEDED;
2935 }
2936
2937 return;
2938}
2939
2940
2941/* verify that all nodes are in normal recovery mode */
2942static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2943{
2944 struct verify_recmode_normal_data *rmdata;
2945 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2946 struct ctdb_client_control_state *state;
2947 enum monitor_result status;
2948 int j;
2949
2950 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2951 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2952 rmdata->count = 0;
2953 rmdata->status = MONITOR_OK;
2954
2955 /* loop over all active nodes and send an async getrecmode call to
2956	   them */
2957 for (j=0; j<nodemap->num; j++) {
2958 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2959 continue;
2960 }
2961 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2962 CONTROL_TIMEOUT(),
2963 nodemap->nodes[j].pnn);
2964 if (state == NULL) {
2965 /* we failed to send the control, treat this as
2966 an error and try again next iteration
2967 */
2968 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2969 talloc_free(mem_ctx);
2970 return MONITOR_FAILED;
2971 }
2972
2973 /* set up the callback functions */
2974 state->async.fn = verify_recmode_normal_callback;
2975 state->async.private_data = rmdata;
2976
2977 /* one more control to wait for to complete */
2978 rmdata->count++;
2979 }
2980
2981
2982 /* now wait for up to the maximum number of seconds allowed
2983	   or until all nodes we expect a response from have replied
2984 */
2985 while (rmdata->count > 0) {
2986 tevent_loop_once(ctdb->ev);
2987 }
2988
2989 status = rmdata->status;
2990 talloc_free(mem_ctx);
2991 return status;
2992}
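
/* Illustrative sketch, not part of the original file: verify_recmode()
 * above uses a fan-out pattern - send N async controls, let each
 * callback decrement a counter, then pump the event loop until the
 * counter reaches zero.  A stripped-down version with hypothetical
 * names: */
#if 0
struct example_fanout {
	uint32_t pending;
	bool failed;
};

static void example_fanout_done(struct example_fanout *state, bool ok)
{
	/* invoked once per completed request */
	state->pending--;
	if (!ok) {
		state->failed = true;
	}
}

static bool example_fanout_wait(struct tevent_context *ev,
				struct example_fanout *state)
{
	while (state->pending > 0) {
		tevent_loop_once(ev);
	}
	return !state->failed;
}
#endif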
2993
2994
2995struct verify_recmaster_data {
2996 struct ctdb_recoverd *rec;
2997 uint32_t count;
2998 uint32_t pnn;
2999 enum monitor_result status;
3000};
3001
3002static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3003{
3004 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3005
3006
3007	/* one more node has responded with recmaster data */
3008 rmdata->count--;
3009
3010 /* if we failed to get the recmaster, then return an error and let
3011 the main loop try again.
3012 */
3013 if (state->state != CTDB_CONTROL_DONE) {
3014 if (rmdata->status == MONITOR_OK) {
3015 rmdata->status = MONITOR_FAILED;
3016 }
3017 return;
3018 }
3019
3020 /* if we got a response, then the recmaster will be stored in the
3021 status field
3022 */
3023 if (state->status != rmdata->pnn) {
3024 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3025 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3026 rmdata->status = MONITOR_ELECTION_NEEDED;
3027 }
3028
3029 return;
3030}
3031
3032
3033/* verify that all nodes agree that we are the recmaster */
3034static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
3035{
3036 struct ctdb_context *ctdb = rec->ctdb;
3037 struct verify_recmaster_data *rmdata;
3038 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3039 struct ctdb_client_control_state *state;
3040 enum monitor_result status;
3041 int j;
3042
3043 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3044 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3045 rmdata->rec = rec;
3046 rmdata->count = 0;
3047 rmdata->pnn = pnn;
3048 rmdata->status = MONITOR_OK;
3049
3050 /* loop over all active nodes and send an async getrecmaster call to
3051	   them */
3052 for (j=0; j<nodemap->num; j++) {
3053 if (nodemap->nodes[j].pnn == rec->recmaster) {
3054 continue;
3055 }
3056 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3057 continue;
3058 }
3059 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3060 CONTROL_TIMEOUT(),
3061 nodemap->nodes[j].pnn);
3062 if (state == NULL) {
3063 /* we failed to send the control, treat this as
3064 an error and try again next iteration
3065 */
3066 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3067 talloc_free(mem_ctx);
3068 return MONITOR_FAILED;
3069 }
3070
3071 /* set up the callback functions */
3072 state->async.fn = verify_recmaster_callback;
3073 state->async.private_data = rmdata;
3074
3075 /* one more control to wait for to complete */
3076 rmdata->count++;
3077 }
3078
3079
3080 /* now wait for up to the maximum number of seconds allowed
3081	   or until all nodes we expect a response from have replied
3082 */
3083 while (rmdata->count > 0) {
3084 tevent_loop_once(ctdb->ev);
3085 }
3086
3087 status = rmdata->status;
3088 talloc_free(mem_ctx);
3089 return status;
3090}
3091
3092static bool interfaces_have_changed(struct ctdb_context *ctdb,
3093 struct ctdb_recoverd *rec)
3094{
3095 struct ctdb_iface_list_old *ifaces = NULL;
3096 TALLOC_CTX *mem_ctx;
3097 bool ret = false;
3098
3099 mem_ctx = talloc_new(NULL);
3100
3101 /* Read the interfaces from the local node */
3102 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3103 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3104 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3105 /* We could return an error. However, this will be
3106 * rare so we'll decide that the interfaces have
3107 * actually changed, just in case.
3108 */
3109 talloc_free(mem_ctx);
3110 return true;
3111 }
3112
3113 if (!rec->ifaces) {
3114 /* We haven't been here before so things have changed */
3115 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3116 ret = true;
3117 } else if (rec->ifaces->num != ifaces->num) {
3118 /* Number of interfaces has changed */
3119 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3120 rec->ifaces->num, ifaces->num));
3121 ret = true;
3122 } else {
3123 /* See if interface names or link states have changed */
3124 int i;
3125 for (i = 0; i < rec->ifaces->num; i++) {
3126 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3127 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3128 DEBUG(DEBUG_NOTICE,
3129 ("Interface in slot %d changed: %s => %s\n",
3130 i, iface->name, ifaces->ifaces[i].name));
3131 ret = true;
3132 break;
3133 }
3134 if (iface->link_state != ifaces->ifaces[i].link_state) {
3135 DEBUG(DEBUG_NOTICE,
3136 ("Interface %s changed state: %d => %d\n",
3137 iface->name, iface->link_state,
3138 ifaces->ifaces[i].link_state));
3139 ret = true;
3140 break;
3141 }
3142 }
3143 }
3144
3145 talloc_free(rec->ifaces);
3146 rec->ifaces = talloc_steal(rec, ifaces);
3147
3148 talloc_free(mem_ctx);
3149 return ret;
3150}
3151
3152/* called to check that the local allocation of public ip addresses is ok.
3153*/
3154static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map_old *nodemap)
3155{
3156 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3157 int ret, j;
3158 bool need_takeover_run = false;
3159
3160 if (interfaces_have_changed(ctdb, rec)) {
3161 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3162 "local node %u - force takeover run\n",
3163 pnn));
3164 need_takeover_run = true;
3165 }
3166
3167	/* verify that we have the IP addresses we should have
3168	   and don't have ones we shouldn't have.
3169	   If we find an inconsistency we set recmode to
3170	   active on the local node and wait for the recmaster
3171	   to do a full blown recovery.
3172	   Also, if the pnn is -1 and we are healthy and can host the IP,
3173	   we request an IP reallocation.
3174 */
3175 if (ctdb->tunable.disable_ip_failover == 0) {
3176 struct ctdb_public_ip_list_old *ips = NULL;
3177
3178 /* read the *available* IPs from the local node */
3179 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3180 if (ret != 0) {
3181 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3182 talloc_free(mem_ctx);
3183 return -1;
3184 }
3185
3186 for (j=0; j<ips->num; j++) {
3187 if (ips->ips[j].pnn == -1 &&
3188 nodemap->nodes[pnn].flags == 0) {
3189 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3190 ctdb_addr_to_str(&ips->ips[j].addr)));
3191 need_takeover_run = true;
3192 }
3193 }
3194
3195 talloc_free(ips);
3196
3197 /* read the *known* IPs from the local node */
3198 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3199 if (ret != 0) {
3200 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3201 talloc_free(mem_ctx);
3202 return -1;
3203 }
3204
3205 for (j=0; j<ips->num; j++) {
3206 if (ips->ips[j].pnn == pnn) {
3207 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3208 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3209 ctdb_addr_to_str(&ips->ips[j].addr)));
3210 need_takeover_run = true;
3211 }
3212 } else {
3213 if (ctdb->do_checkpublicip &&
3214 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3215
3216 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3217 ctdb_addr_to_str(&ips->ips[j].addr)));
3218
3219 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3220 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3221 }
3222 }
3223 }
3224 }
3225 }
3226
3227 if (need_takeover_run) {
3228 struct ctdb_srvid_message rd;
3229 TDB_DATA data;
3230
3231 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3232
3233 ZERO_STRUCT(rd);
3234 rd.pnn = ctdb->pnn;
3235 rd.srvid = 0;
3236 data.dptr = (uint8_t *)&rd;
3237 data.dsize = sizeof(rd);
3238
3239 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3240 if (ret != 0) {
3241			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster %d\n", (int)rec->recmaster));
3242 }
3243 }
3244 talloc_free(mem_ctx);
3245 return 0;
3246}
3247
3248
3249static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3250{
3251 struct ctdb_node_map_old **remote_nodemaps = callback_data;
3252
3253 if (node_pnn >= ctdb->num_nodes) {
3254 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3255 return;
3256 }
3257
3258 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
3259
3260}
3261
3262static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3263 struct ctdb_node_map_old *nodemap,
3264 struct ctdb_node_map_old **remote_nodemaps)
3265{
3266 uint32_t *nodes;
3267
3268 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3269 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3270 nodes, 0,
3271 CONTROL_TIMEOUT(), false, tdb_null,
3272 async_getnodemap_callback,
3273 NULL,
3274 remote_nodemaps) != 0) {
3275 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3276
3277 return -1;
3278 }
3279
3280 return 0;
3281}
3282
3283static int update_recovery_lock_file(struct ctdb_context *ctdb)
3284{
3285 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3286 const char *reclockfile;
3287
3288 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3289 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3290 talloc_free(tmp_ctx);
3291 return -1;
3292 }
3293
3294 if (reclockfile == NULL) {
3295 if (ctdb->recovery_lock_file != NULL) {
3296 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3297 talloc_free(ctdb->recovery_lock_file);
3298 ctdb->recovery_lock_file = NULL;
3299 ctdb_recovery_unlock(ctdb);
3300 }
3301 talloc_free(tmp_ctx);
3302 return 0;
3303 }
3304
3305 if (ctdb->recovery_lock_file == NULL) {
3306 DEBUG(DEBUG_NOTICE,
3307 ("Recovery lock file enabled (%s)\n", reclockfile));
3308 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3309 ctdb_recovery_unlock(ctdb);
3310 talloc_free(tmp_ctx);
3311 return 0;
3312 }
3313
3314
3315 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3316 talloc_free(tmp_ctx);
3317 return 0;
3318 }
3319
3320 DEBUG(DEBUG_NOTICE,
3321 ("Recovery lock file changed (now %s)\n", reclockfile));
3322 talloc_free(ctdb->recovery_lock_file);
3323 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3324 ctdb_recovery_unlock(ctdb);
3325
3326 talloc_free(tmp_ctx);
3327 return 0;
3328}
3329
3330static enum monitor_result validate_recovery_master(struct ctdb_recoverd *rec,
3331 TALLOC_CTX *mem_ctx)
3332{
3333 struct ctdb_context *ctdb = rec->ctdb;
3334 uint32_t pnn = ctdb_get_pnn(ctdb);
3335 struct ctdb_node_map_old *nodemap = rec->nodemap;
3336 struct ctdb_node_map_old *recmaster_nodemap = NULL;
3337 int ret;
3338
3339 /* When recovery daemon is started, recmaster is set to
3340 * "unknown" so it knows to start an election.
3341 */
3342 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3343 DEBUG(DEBUG_NOTICE,
3344 ("Initial recovery master set - forcing election\n"));
3345 return MONITOR_ELECTION_NEEDED;
3346 }
3347
3348 /*
3349 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3350 * but we have, then force an election and try to become the new
3351 * recmaster.
3352 */
3353 if (!ctdb_node_has_capabilities(rec->caps,
3354 rec->recmaster,
3355 CTDB_CAP_RECMASTER) &&
3356 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3357 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3358 DEBUG(DEBUG_ERR,
3359 (" Current recmaster node %u does not have CAP_RECMASTER,"
3360 " but we (node %u) have - force an election\n",
3361 rec->recmaster, pnn));
3362 return MONITOR_ELECTION_NEEDED;
3363 }
3364
3365 /* Verify that the master node has not been deleted. This
3366 * should not happen because a node should always be shutdown
3367 * before being deleted, causing a new master to be elected
3368 * before now. However, if something strange has happened
3369 * then checking here will ensure we don't index beyond the
3370 * end of the nodemap array. */
3371 if (rec->recmaster >= nodemap->num) {
3372 DEBUG(DEBUG_ERR,
3373 ("Recmaster node %u has been deleted. Force election\n",
3374 rec->recmaster));
3375 return MONITOR_ELECTION_NEEDED;
3376 }
3377
3378 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3379 if (nodemap->nodes[rec->recmaster].flags &
3380 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3381 DEBUG(DEBUG_NOTICE,
3382 ("Recmaster node %u is disconnected/deleted. Force election\n",
3383 rec->recmaster));
3384 return MONITOR_ELECTION_NEEDED;
3385 }
3386
3387 /* get nodemap from the recovery master to check if it is inactive */
3388 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3389 mem_ctx, &recmaster_nodemap);
3390 if (ret != 0) {
3391 DEBUG(DEBUG_ERR,
3392 (__location__
3393 " Unable to get nodemap from recovery master %u\n",
3394 rec->recmaster));
3395 return MONITOR_FAILED;
3396 }
3397
3398
3399 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3400 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3401 DEBUG(DEBUG_NOTICE,
3402 ("Recmaster node %u is inactive. Force election\n",
3403 rec->recmaster));
3404 /*
3405 * update our nodemap to carry the recmaster's notion of
3406 * its own flags, so that we don't keep freezing the
3407 * inactive recmaster node...
3408 */
3409 nodemap->nodes[rec->recmaster].flags =
3410 recmaster_nodemap->nodes[rec->recmaster].flags;
3411 return MONITOR_ELECTION_NEEDED;
3412 }
3413
3414 return MONITOR_OK;
3415}
3416
3417static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3418 TALLOC_CTX *mem_ctx)
3419{
3420 uint32_t pnn;
3421 struct ctdb_node_map_old *nodemap=NULL;
3422 struct ctdb_node_map_old **remote_nodemaps=NULL;
3423 struct ctdb_vnn_map *vnnmap=NULL;
3424 struct ctdb_vnn_map *remote_vnnmap=NULL;
3425 uint32_t num_lmasters;
3426 int32_t debug_level;
3427 int i, j, ret;
3428 bool self_ban;
3429
3430
3431 /* verify that the main daemon is still running */
3432 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3433 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3434 exit(-1);
3435 }
3436
3437 /* ping the local daemon to tell it we are alive */
3438 ctdb_ctrl_recd_ping(ctdb);
3439
3440 if (rec->election_timeout) {
3441 /* an election is in progress */
3442 return;
3443 }
3444
3445 /* read the debug level from the parent and update locally */
3446 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3447 if (ret !=0) {
3448 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3449 return;
3450 }
3451 DEBUGLEVEL = debug_level;
3452
3453 /* get relevant tunables */
3454 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3455 if (ret != 0) {
3456 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3457 return;
3458 }
3459
3460 /* get runstate */
3461 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3462 CTDB_CURRENT_NODE, &ctdb->runstate);
3463 if (ret != 0) {
3464 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3465 return;
3466 }
3467
3468 /* get the current recovery lock file from the server */
3469 if (update_recovery_lock_file(ctdb) != 0) {
3470 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3471 return;
3472 }
3473
3474 pnn = ctdb_get_pnn(ctdb);
3475
3476 /* get nodemap */
3477 TALLOC_FREE(rec->nodemap);
3478 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3479 if (ret != 0) {
3480 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3481 return;
3482 }
3483 nodemap = rec->nodemap;
3484
3485 /* remember our own node flags */
3486 rec->node_flags = nodemap->nodes[pnn].flags;
3487
3488 ban_misbehaving_nodes(rec, &self_ban);
3489 if (self_ban) {
3490 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3491 return;
3492 }
3493
3494 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3495 also frozen and that the recmode is set to active.
3496 */
3497 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3498 /* If this node has become inactive then we want to
3499 * reduce the chances of it taking over the recovery
3500 * master role when it becomes active again. This
3501 * helps to stabilise the recovery master role so that
3502 * it stays on the most stable node.
3503 */
3504 rec->priority_time = timeval_current();
3505
3506 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3507 if (ret != 0) {
3508 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3509 }
3510 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3511 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3512
3513 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3514 if (ret != 0) {
3515 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3516
3517 return;
3518 }
3519 }
3520 if (! rec->frozen_on_inactive) {
3521 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
3522 CTDB_CURRENT_NODE);
3523 if (ret != 0) {
3524 DEBUG(DEBUG_ERR,
3525 (__location__ " Failed to freeze node "
3526 "in STOPPED or BANNED state\n"));
3527 return;
3528 }
3529
3530 rec->frozen_on_inactive = true;
3531 }
3532
3533 /* If this node is stopped or banned then it is not the recovery
3534 * master, so don't do anything. This prevents stopped or banned
3535 * node from starting election and sending unnecessary controls.
3536 */
3537 return;
3538 }
3539
3540 rec->frozen_on_inactive = false;
3541
3542 /* If we are not the recmaster then do some housekeeping */
3543 if (rec->recmaster != pnn) {
3544 /* Ignore any IP reallocate requests - only recmaster
3545 * processes them
3546 */
3547 TALLOC_FREE(rec->reallocate_requests);
3548 /* Clear any nodes that should be force rebalanced in
3549 * the next takeover run. If the recovery master role
3550 * has moved then we don't want to process these some
3551 * time in the future.
3552 */
3553 TALLOC_FREE(rec->force_rebalance_nodes);
3554 }
3555
3556 /* Retrieve capabilities from all connected nodes */
3557 ret = update_capabilities(rec, nodemap);
3558 if (ret != 0) {
3559 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3560 return;
3561 }
3562
3563 switch (validate_recovery_master(rec, mem_ctx)) {
3564 case MONITOR_RECOVERY_NEEDED:
3565		/* cannot happen */
3566 return;
3567 case MONITOR_ELECTION_NEEDED:
3568 force_election(rec, pnn, nodemap);
3569 return;
3570 case MONITOR_OK:
3571 break;
3572 case MONITOR_FAILED:
3573 return;
3574 }
3575
3576	/* verify that we have all the IP addresses we should have and
3577	 * don't have addresses we shouldn't have.
3578 */
3579 if (ctdb->tunable.disable_ip_failover == 0 &&
3580 !ctdb_op_is_disabled(rec->takeover_run)) {
3581 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3582 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3583 }
3584 }
3585
3586
3587 /* if we are not the recmaster then we do not need to check
3588 if recovery is needed
3589 */
3590 if (pnn != rec->recmaster) {
3591 return;
3592 }
3593
3594
3595 /* ensure our local copies of flags are right */
3596 ret = update_local_flags(rec, nodemap);
3597 if (ret == MONITOR_ELECTION_NEEDED) {
3598 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3599 force_election(rec, pnn, nodemap);
3600 return;
3601 }
3602 if (ret != MONITOR_OK) {
3603 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3604 return;
3605 }
3606
3607 if (ctdb->num_nodes != nodemap->num) {
3608 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3609 ctdb_load_nodes_file(ctdb);
3610 return;
3611 }
3612
3613 /* verify that all active nodes agree that we are the recmaster */
3614 switch (verify_recmaster(rec, nodemap, pnn)) {
3615 case MONITOR_RECOVERY_NEEDED:
3616		/* cannot happen */
3617 return;
3618 case MONITOR_ELECTION_NEEDED:
3619 force_election(rec, pnn, nodemap);
3620 return;
3621 case MONITOR_OK:
3622 break;
3623 case MONITOR_FAILED:
3624 return;
3625 }
3626
3627
3628 /* get the vnnmap */
3629 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3630 if (ret != 0) {
3631 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3632 return;
3633 }
3634
3635 if (rec->need_recovery) {
3636 /* a previous recovery didn't finish */
3637 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3638 return;
3639 }
3640
3641 /* verify that all active nodes are in normal mode
3642 and not in recovery mode
3643 */
3644 switch (verify_recmode(ctdb, nodemap)) {
3645 case MONITOR_RECOVERY_NEEDED:
3646 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3647 return;
3648 case MONITOR_FAILED:
3649 return;
3650 case MONITOR_ELECTION_NEEDED:
3651		/* cannot happen */
3652 case MONITOR_OK:
3653 break;
3654 }
3655
3656
3657 if (ctdb->recovery_lock_file != NULL) {
3658 /* We must already hold the recovery lock */
3659 if (!ctdb_recovery_have_lock(ctdb)) {
3660 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3661 ctdb_set_culprit(rec, ctdb->pnn);
3662 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3663 return;
3664 }
3665 }
3666
3667
3668	/* if there are takeover runs requested, perform them and notify the waiters */
3669 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3670 rec->reallocate_requests) {
3671 process_ipreallocate_requests(ctdb, rec);
3672 }
3673
3674 /* If recoveries are disabled then there is no use doing any
3675 * nodemap or flags checks. Recoveries might be disabled due
3676 * to "reloadnodes", so doing these checks might cause an
3677 * unnecessary recovery. */
3678 if (ctdb_op_is_disabled(rec->recovery)) {
3679 return;
3680 }

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for (i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);
			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has a different node count (%u vs our %u)\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0; i<nodemap->num; i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has a different nodemap pnn for %d (%u vs %u)\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensures we
	 * have up-to-date information for all the nodes.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		 */
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				      nodemap->nodes[j].pnn,
				      nodemap->nodes[i].pnn,
				      remote_nodemaps[j]->nodes[i].flags,
				      nodemap->nodes[i].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}

	/* count how many active nodes have the lmaster capability */
	num_lmasters = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}

	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability... or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
			  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but does not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0; i<vnnmap->size; i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		/* If a takeover run fails, the offending nodes are
		 * assigned ban culprit counts and the takeover is
		 * retried.  If takeover runs fail repeatedly, the
		 * node will eventually be banned.
		 */
		do_takeover_run(rec, nodemap, true);
	}
}

/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;
	rec->recmaster = CTDB_UNKNOWN_PNN;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();
	rec->frozen_on_inactive = false;

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemon's node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node at the
	   next reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching a database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

	for (;;) {
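		/* Run each iteration of the monitoring loop on its own
		 * temporary talloc context, so that everything allocated
		 * by main_loop() is released before we sleep.
		 */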
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every RecoverInterval seconds */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}

/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
				 ctdb_restart_recd, ctdb);

		return;
	}

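	/* the recovery daemon is still alive; schedule the next
	 * liveness check in 30 seconds
	 */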
	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
			 timeval_current_ofs(30, 0),
			 ctdb_check_recd, ctdb);
}

static void recd_sig_child_handler(struct tevent_context *ev,
				   struct tevent_signal *se, int signum,
				   int count, void *dont_care,
				   void *private_data)
{
	int status;
	pid_t pid = -1;

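	/* Reap all exited children without blocking: waitpid() with
	 * WNOHANG returns 0 once no more exited children remain, and
	 * -1 with errno ECHILD when there are no children at all.
	 */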
	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}

/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct tevent_signal *se;
	struct tevent_fd *fde;

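	/* This pipe is used to detect the death of the main daemon:
	 * the parent keeps the write end open and the child watches
	 * the read end for EOF.
	 */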
	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

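	/* In the parent: keep the write end of the pipe open for the
	 * child and periodically verify that the recovery daemon is
	 * still alive.
	 */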
	if (ctdb->recoverd_pid != 0) {
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
				 timeval_current_ofs(30, 0),
				 ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

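	/* re-seed the random generator in the child so it does not
	 * share the parent's sequence
	 */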
	srandom(getpid() ^ time(NULL));

	prctl_set_comment("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

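	/* Watch the read end of the pipe: when the parent daemon dies
	 * the write end is closed, fd[0] becomes readable (EOF) and
	 * ctdb_recoverd_parent() exits the recovery daemon.
	 */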
	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
			    ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
			       recd_sig_child_handler, ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}

/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}