source: vendor/current/ctdb/server/ctdb_recoverd.c

/*
   ctdb recovery daemon

   Copyright (C) Ronnie Sahlberg 2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/

#include "replace.h"
#include "system/filesys.h"
#include "system/time.h"
#include "system/network.h"
#include "system/wait.h"

#include <popt.h>
#include <talloc.h>
#include <tevent.h>
#include <tdb.h>

#include "lib/tdb_wrap/tdb_wrap.h"
#include "lib/util/dlinklist.h"
#include "lib/util/debug.h"
#include "lib/util/samba_util.h"
#include "lib/util/util_process.h"

#include "ctdb_private.h"
#include "ctdb_client.h"

#include "common/system.h"
#include "common/cmdline.h"
#include "common/common.h"
#include "common/logging.h"


/* List of SRVID requests that need to be processed */
struct srvid_list {
	struct srvid_list *next, *prev;
	struct ctdb_srvid_message *request;
};

struct srvid_requests {
	struct srvid_list *requests;
};

static void srvid_request_reply(struct ctdb_context *ctdb,
				struct ctdb_srvid_message *request,
				TDB_DATA result)
{
	/* Someone that sent srvid==0 does not want a reply */
	if (request->srvid == 0) {
		talloc_free(request);
		return;
	}

	if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
				     result) == 0) {
		DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
				  (unsigned)request->pnn,
				  (unsigned long long)request->srvid));
	} else {
		DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
				 (unsigned)request->pnn,
				 (unsigned long long)request->srvid));
	}

	talloc_free(request);
}

static void srvid_requests_reply(struct ctdb_context *ctdb,
				 struct srvid_requests **requests,
				 TDB_DATA result)
{
	struct srvid_list *r;

	for (r = (*requests)->requests; r != NULL; r = r->next) {
		srvid_request_reply(ctdb, r->request, result);
	}

	/* Free the list structure... */
	TALLOC_FREE(*requests);
}

static void srvid_request_add(struct ctdb_context *ctdb,
			      struct srvid_requests **requests,
			      struct ctdb_srvid_message *request)
{
	struct srvid_list *t;
	int32_t ret;
	TDB_DATA result;

	if (*requests == NULL) {
		*requests = talloc_zero(ctdb, struct srvid_requests);
		if (*requests == NULL) {
			goto nomem;
		}
	}

	t = talloc_zero(*requests, struct srvid_list);
	if (t == NULL) {
		/* If *requests was just allocated above then free it */
		if ((*requests)->requests == NULL) {
			TALLOC_FREE(*requests);
		}
		goto nomem;
	}

	t->request = (struct ctdb_srvid_message *)talloc_steal(t, request);
	DLIST_ADD((*requests)->requests, t);

	return;

nomem:
	/* Failed to add the request to the list.  Send a fail. */
	DEBUG(DEBUG_ERR, (__location__
			  " Out of memory, failed to queue SRVID request\n"));
	ret = -ENOMEM;
	result.dsize = sizeof(ret);
	result.dptr = (uint8_t *)&ret;
	srvid_request_reply(ctdb, request, result);
}

/* An abstraction to allow an operation (takeover runs, recoveries,
 * ...) to be disabled for a given timeout */
struct ctdb_op_state {
	struct tevent_timer *timer;
	bool in_progress;
	const char *name;
};

static struct ctdb_op_state *ctdb_op_init(TALLOC_CTX *mem_ctx, const char *name)
{
	struct ctdb_op_state *state = talloc_zero(mem_ctx, struct ctdb_op_state);

	if (state != NULL) {
		state->in_progress = false;
		state->name = name;
	}

	return state;
}

static bool ctdb_op_is_disabled(struct ctdb_op_state *state)
{
	return state->timer != NULL;
}

static bool ctdb_op_begin(struct ctdb_op_state *state)
{
	if (ctdb_op_is_disabled(state)) {
		DEBUG(DEBUG_NOTICE,
		      ("Unable to begin - %s are disabled\n", state->name));
		return false;
	}

	state->in_progress = true;
	return true;
}

static bool ctdb_op_end(struct ctdb_op_state *state)
{
	return state->in_progress = false;
}

static bool ctdb_op_is_in_progress(struct ctdb_op_state *state)
{
	return state->in_progress;
}

static void ctdb_op_enable(struct ctdb_op_state *state)
{
	TALLOC_FREE(state->timer);
}

static void ctdb_op_timeout_handler(struct tevent_context *ev,
				    struct tevent_timer *te,
				    struct timeval yt, void *p)
{
	struct ctdb_op_state *state =
		talloc_get_type(p, struct ctdb_op_state);

	DEBUG(DEBUG_NOTICE,("Reenabling %s after timeout\n", state->name));
	ctdb_op_enable(state);
}

static int ctdb_op_disable(struct ctdb_op_state *state,
			   struct tevent_context *ev,
			   uint32_t timeout)
{
	if (timeout == 0) {
		DEBUG(DEBUG_NOTICE,("Reenabling %s\n", state->name));
		ctdb_op_enable(state);
		return 0;
	}

	if (state->in_progress) {
		DEBUG(DEBUG_ERR,
		      ("Unable to disable %s - in progress\n", state->name));
		return -EAGAIN;
	}

	DEBUG(DEBUG_NOTICE,("Disabling %s for %u seconds\n",
			    state->name, timeout));

	/* Clear any old timers */
	talloc_free(state->timer);

	/* Arrange for the timeout to occur */
	state->timer = tevent_add_timer(ev, state,
					timeval_current_ofs(timeout, 0),
					ctdb_op_timeout_handler, state);
	if (state->timer == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to setup timer\n"));
		return -ENOMEM;
	}

	return 0;
}

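/*
 * A minimal usage sketch of the ctdb_op abstraction above.  This helper
 * is illustrative only (the function name is hypothetical and nothing in
 * the daemon calls it): an operation cannot begin while a disable timer
 * is pending, and cannot be disabled while it is in progress.
 */
static int ctdb_op_usage_sketch(TALLOC_CTX *mem_ctx, struct tevent_context *ev)
{
	struct ctdb_op_state *op = ctdb_op_init(mem_ctx, "takeover runs");

	if (op == NULL) {
		return -ENOMEM;
	}

	/* Disable the operation for 60 seconds ... */
	if (ctdb_op_disable(op, ev, 60) != 0) {
		return -1;
	}

	/* ... so beginning it now fails and logs a notice ... */
	if (!ctdb_op_begin(op)) {
		/* ... until the timer fires, or it is re-enabled early */
		ctdb_op_enable(op);
	}

	if (ctdb_op_begin(op)) {
		/* ... perform the operation ... */
		ctdb_op_end(op);
	}

	return 0;
}
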
struct ctdb_banning_state {
	uint32_t count;
	struct timeval last_reported_time;
};

/*
  private state of recovery daemon
 */
struct ctdb_recoverd {
	struct ctdb_context *ctdb;
	uint32_t recmaster;
	uint32_t last_culprit_node;
	struct ctdb_node_map_old *nodemap;
	struct timeval priority_time;
	bool need_takeover_run;
	bool need_recovery;
	uint32_t node_flags;
	struct tevent_timer *send_election_te;
	struct tevent_timer *election_timeout;
	struct srvid_requests *reallocate_requests;
	struct ctdb_op_state *takeover_run;
	struct ctdb_op_state *recovery;
	struct ctdb_iface_list_old *ifaces;
	uint32_t *force_rebalance_nodes;
	struct ctdb_node_capabilities *caps;
	bool frozen_on_inactive;
};

#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te, struct timeval t,
			      void *private_data);

/*
  ban a node for a period of time
 */
static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
{
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_ban_state bantime;

	if (!ctdb_validate_pnn(ctdb, pnn)) {
		DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
		return;
	}

	DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));

	bantime.pnn = pnn;
	bantime.time = ban_time;

	ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
		return;
	}
}

enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};


/*
  remember the troublemaker
 */
static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
{
	struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
	struct ctdb_banning_state *ban_state;

	/* culprit is an index into ctdb->nodes[], so num_nodes is already
	   out of range */
	if (culprit >= ctdb->num_nodes) {
		DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
		return;
	}

	/* If we are banned or stopped, do not set other nodes as culprits */
	if (rec->node_flags & NODE_FLAGS_INACTIVE) {
		DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
		return;
	}

	if (ctdb->nodes[culprit]->ban_state == NULL) {
		ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
		CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
	}
	ban_state = ctdb->nodes[culprit]->ban_state;
	if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
		/* this was the first time in a long while this node
		   misbehaved so we will forgive any old transgressions.
		*/
		ban_state->count = 0;
	}

	ban_state->count += count;
	ban_state->last_reported_time = timeval_current();
	rec->last_culprit_node = culprit;
}

/*
  remember the troublemaker
 */
static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
{
	ctdb_set_culprit_count(rec, culprit, 1);
}

/* this callback is called for every node that failed to execute the
   recovered event
*/
static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "recovered" eventscript on all nodes
 */
static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, const char *caller)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, tdb_null,
				      NULL, recovered_fail_callback,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));

		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/* this callback is called for every node that failed to execute the
   start recovery event
*/
static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));

	ctdb_set_culprit(rec, node_pnn);
}

/*
  run the "startrecovery" eventscript on all nodes
 */
static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
{
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, tdb_null,
				      NULL,
				      startrecovery_fail_callback,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/*
  Retrieve capabilities from all connected nodes
 */
static int update_capabilities(struct ctdb_recoverd *rec,
			       struct ctdb_node_map_old *nodemap)
{
	uint32_t *capp;
	TALLOC_CTX *tmp_ctx;
	struct ctdb_node_capabilities *caps;
	struct ctdb_context *ctdb = rec->ctdb;

	tmp_ctx = talloc_new(rec);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	caps = ctdb_get_capabilities(ctdb, tmp_ctx,
				     CONTROL_TIMEOUT(), nodemap);

	if (caps == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__ " Failed to get node capabilities\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	capp = ctdb_get_node_capabilities(caps, ctdb_get_pnn(ctdb));
	if (capp == NULL) {
		DEBUG(DEBUG_ERR,
		      (__location__
		       " Capabilities don't include current node.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}
	ctdb->capabilities = *capp;

	TALLOC_FREE(rec->caps);
	rec->caps = talloc_steal(rec, caps);

	talloc_free(tmp_ctx);
	return 0;
}

static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

	DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
	ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
}

/*
  change recovery mode on all nodes
 */
static int set_recovery_mode(struct ctdb_context *ctdb,
			     struct ctdb_recoverd *rec,
			     struct ctdb_node_map_old *nodemap,
			     uint32_t rec_mode, bool freeze)
{
	TDB_DATA data;
	uint32_t *nodes;
	TALLOC_CTX *tmp_ctx;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);

	data.dsize = sizeof(uint32_t);
	data.dptr = (unsigned char *)&rec_mode;

	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	/* freeze all nodes */
	if (freeze && rec_mode == CTDB_RECOVERY_ACTIVE) {
		int i;

		for (i=1; i<=NUM_DB_PRIORITIES; i++) {
			if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
						      nodes, i,
						      CONTROL_TIMEOUT(),
						      false, tdb_null,
						      NULL,
						      set_recmode_fail_callback,
						      rec) != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
				talloc_free(tmp_ctx);
				return -1;
			}
		}
	}

	talloc_free(tmp_ctx);
	return 0;
}

/* update all remote nodes to use the same db priority that we have.
   This can fail if the remote node has not yet been upgraded to
   support this function, so we always return success and never fail
   a recovery if this call fails.
*/
static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
					      struct ctdb_node_map_old *nodemap,
					      uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int db;

	/* step through all local databases */
	for (db=0; db<dbmap->num;db++) {
		struct ctdb_db_priority db_prio;
		int ret;

		db_prio.db_id = dbmap->dbs[db].db_id;
		ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].db_id, &db_prio.priority);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].db_id));
			continue;
		}

		DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].db_id, db_prio.priority));

		ret = ctdb_ctrl_set_db_priority(ctdb, CONTROL_TIMEOUT(),
						CTDB_CURRENT_NODE, &db_prio);
		if (ret != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n",
					 db_prio.db_id));
		}
	}

	return 0;
}

/*
  ensure all other nodes have attached to any databases that we have
 */
static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
					   uint32_t pnn, struct ctdb_dbid_map_old *dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map_old *remote_dbmap;

	/* verify that all other nodes have all our databases */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all local databases */
		for (db=0; db<dbmap->num;db++) {
			const char *name;

			for (i=0;i<remote_dbmap->num;i++) {
				if (dbmap->dbs[db].db_id == remote_dbmap->dbs[i].db_id) {
					break;
				}
			}
			/* the remote node already has this database */
			if (i!=remote_dbmap->num) {
				continue;
			}
			/* ok so we need to create this database */
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn,
						  dbmap->dbs[db].db_id, mem_ctx,
						  &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(),
						 nodemap->nodes[j].pnn,
						 mem_ctx, name,
						 dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
				return -1;
			}
		}
	}

	return 0;
}

/*
  ensure we are attached to any databases that anyone else is attached to
 */
static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
					  uint32_t pnn, struct ctdb_dbid_map_old **dbmap, TALLOC_CTX *mem_ctx)
{
	int i, j, db, ret;
	struct ctdb_dbid_map_old *remote_dbmap;

	/* verify that we have all databases any other node has */
	for (j=0; j<nodemap->num; j++) {
		/* we don't need to check ourselves */
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}
		/* don't check nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					 mem_ctx, &remote_dbmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}

		/* step through all databases on the remote node */
		for (db=0; db<remote_dbmap->num;db++) {
			const char *name;

			for (i=0;i<(*dbmap)->num;i++) {
				if (remote_dbmap->dbs[db].db_id == (*dbmap)->dbs[i].db_id) {
					break;
				}
			}
			/* we already have this db locally */
			if (i!=(*dbmap)->num) {
				continue;
			}
			/* ok so we need to create this database and
			   rebuild dbmap
			*/
			ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
						  remote_dbmap->dbs[db].db_id, mem_ctx, &name);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n",
						  nodemap->nodes[j].pnn));
				return -1;
			}
			ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name,
						 remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
				return -1;
			}
			ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
				return -1;
			}
		}
	}

	return 0;
}

/*
  pull the remote database contents from one node into the recdb
 */
static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode,
				    struct tdb_wrap *recdb, uint32_t dbid)
{
	int ret;
	TDB_DATA outdata;
	struct ctdb_marshall_buffer *reply;
	struct ctdb_rec_data_old *recdata;
	int i;
	TALLOC_CTX *tmp_ctx = talloc_new(recdb);

	ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
			       CONTROL_TIMEOUT(), &outdata);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
		talloc_free(tmp_ctx);
		return -1;
	}

	reply = (struct ctdb_marshall_buffer *)outdata.dptr;

	if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
		DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = (struct ctdb_rec_data_old *)&reply->data[0];

	for (i=0;
	     i<reply->count;
	     recdata = (struct ctdb_rec_data_old *)(recdata->length + (uint8_t *)recdata), i++) {
		TDB_DATA key, data;
		struct ctdb_ltdb_header *hdr;
		TDB_DATA existing;

		key.dptr = &recdata->data[0];
		key.dsize = recdata->keylen;
		data.dptr = &recdata->data[key.dsize];
		data.dsize = recdata->datalen;

		hdr = (struct ctdb_ltdb_header *)data.dptr;

		if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
			DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}

		/* fetch the existing record, if any */
		existing = tdb_fetch(recdb->tdb, key);

		if (existing.dptr != NULL) {
			struct ctdb_ltdb_header header;
			if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
				DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n",
						  (unsigned)existing.dsize, srcnode));
				free(existing.dptr);
				talloc_free(tmp_ctx);
				return -1;
			}
			header = *(struct ctdb_ltdb_header *)existing.dptr;
			free(existing.dptr);
			if (!(header.rsn < hdr->rsn ||
			      (header.dmaster != ctdb_get_pnn(ctdb) &&
			       header.rsn == hdr->rsn))) {
				continue;
			}
		}

		if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
			DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
			talloc_free(tmp_ctx);
			return -1;
		}
	}

	talloc_free(tmp_ctx);

	return 0;
}

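/*
 * A sketch of the merge rule applied in the loop above, pulled out as a
 * hypothetical predicate for clarity (it is not used by the daemon): an
 * incoming copy of a record replaces the copy already stored in the recdb
 * when it carries a strictly higher RSN, or when the RSNs are equal and
 * the stored copy is not dmastered by this node.
 */
static bool recdb_incoming_copy_wins(const struct ctdb_ltdb_header *stored,
				     const struct ctdb_ltdb_header *incoming,
				     uint32_t our_pnn)
{
	if (stored->rsn < incoming->rsn) {
		return true;
	}
	if (stored->rsn == incoming->rsn && stored->dmaster != our_pnn) {
		return true;
	}
	return false;
}
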
struct pull_seqnum_cbdata {
	int failed;
	uint32_t pnn;
	uint64_t seqnum;
};

static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
	uint64_t seqnum;

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
		return;
	}

	if (res != 0) {
		DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
		cb_data->failed = 1;
		return;
	}

	if (outdata.dsize != sizeof(uint64_t)) {
		DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
		cb_data->failed = 1;
		return;
	}

	seqnum = *((uint64_t *)outdata.dptr);

	if (seqnum > cb_data->seqnum ||
	    (cb_data->pnn == -1 && seqnum == 0)) {
		cb_data->seqnum = seqnum;
		cb_data->pnn = node_pnn;
	}
}

static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);

	DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
	cb_data->failed = 1;
}

static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
				   struct ctdb_recoverd *rec,
				   struct ctdb_node_map_old *nodemap,
				   struct tdb_wrap *recdb, uint32_t dbid)
{
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	uint32_t *nodes;
	TDB_DATA data;
	uint32_t outdata[2];
	struct pull_seqnum_cbdata *cb_data;

	DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));

	outdata[0] = dbid;
	outdata[1] = 0;

	data.dsize = sizeof(outdata);
	data.dptr = (uint8_t *)&outdata[0];

	cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
	if (cb_data == NULL) {
		DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	cb_data->failed = 0;
	cb_data->pnn = -1;
	cb_data->seqnum = 0;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      pull_seqnum_cb,
				      pull_seqnum_fail_cb,
				      cb_data) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));

		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->failed != 0) {
		DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	if (cb_data->pnn == -1) {
		DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum));

	if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
		DEBUG(DEBUG_ERR, ("Failed to pull highest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

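/*
 * A sketch of the candidate selection rule implemented by pull_seqnum_cb,
 * written as a hypothetical predicate (not called anywhere): the first
 * node to answer becomes the candidate even if it reports sequence
 * number 0; after that, only a strictly larger sequence number wins.
 */
static bool seqnum_candidate_is_better(const struct pull_seqnum_cbdata *best,
				       uint64_t seqnum)
{
	if (best->pnn == (uint32_t)-1 && seqnum == 0) {
		/* no candidate chosen yet */
		return true;
	}
	return seqnum > best->seqnum;
}
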
/*
  pull all the remote database contents into the recdb
 */
static int pull_remote_database(struct ctdb_context *ctdb,
				struct ctdb_recoverd *rec,
				struct ctdb_node_map_old *nodemap,
				struct tdb_wrap *recdb, uint32_t dbid,
				bool persistent)
{
	int j;

	if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
		int ret;
		ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
		if (ret == 0) {
			return 0;
		}
	}

	/* pull all records from all other nodes across onto this node
	   (this merges based on rsn)
	*/
	for (j=0; j<nodemap->num; j++) {
		/* don't merge from nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
			DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n",
					 nodemap->nodes[j].pnn));
			ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
			return -1;
		}
	}

	return 0;
}

/*
  update flags on all active nodes
 */
static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap, uint32_t pnn, uint32_t flags)
{
	int ret;

	ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
		return -1;
	}

	return 0;
}

/*
  ensure all nodes have the same vnnmap we do
 */
static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
				      uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
{
	int j, ret;

	/* push the new vnn map out to all the nodes */
	for (j=0; j<nodemap->num; j++) {
		/* don't push to nodes that are unavailable */
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", nodemap->nodes[j].pnn));
			return -1;
		}
	}

	return 0;
}


/*
  called when a vacuum fetch has completed - just free it and do the next one
 */
static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
{
	talloc_free(state);
}

/**
 * Process one element of the vacuum fetch list:
 * Migrate it over to us with the special flag
 * CTDB_CALL_FLAG_VACUUM_MIGRATION.
 */
static bool vacuum_fetch_process_one(struct ctdb_db_context *ctdb_db,
				     uint32_t pnn,
				     struct ctdb_rec_data_old *r)
{
	struct ctdb_client_call_state *state;
	TDB_DATA data;
	struct ctdb_ltdb_header *hdr;
	struct ctdb_call call;

	ZERO_STRUCT(call);
	call.call_id = CTDB_NULL_FUNC;
	call.flags = CTDB_IMMEDIATE_MIGRATION;
	call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;

	call.key.dptr = &r->data[0];
	call.key.dsize = r->keylen;

	/* ensure we don't block this daemon - just skip a record if we can't get
	   the chainlock */
	if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, call.key) != 0) {
		return true;
	}

	data = tdb_fetch(ctdb_db->ltdb->tdb, call.key);
	if (data.dptr == NULL) {
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (hdr->dmaster == pnn) {
		/* it's already local */
		free(data.dptr);
		tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
		return true;
	}

	free(data.dptr);

	state = ctdb_call_send(ctdb_db, &call);
	tdb_chainunlock(ctdb_db->ltdb->tdb, call.key);
	if (state == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
		return false;
	}
	state->async.fn = vacuum_fetch_callback;
	state->async.private_data = NULL;

	return true;
}

/*
  handler for vacuum fetch
*/
static void vacuum_fetch_handler(uint64_t srvid, TDB_DATA data,
				 void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	struct ctdb_marshall_buffer *recs;
	int ret, i;
	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
	const char *name;
	struct ctdb_dbid_map_old *dbmap=NULL;
	bool persistent = false;
	struct ctdb_db_context *ctdb_db;
	struct ctdb_rec_data_old *r;

	recs = (struct ctdb_marshall_buffer *)data.dptr;

	if (recs->count == 0) {
		goto done;
	}

	/* work out if the database is persistent */
	ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
		goto done;
	}

	for (i=0;i<dbmap->num;i++) {
		if (dbmap->dbs[i].db_id == recs->db_id) {
			persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
			break;
		}
	}
	if (i == dbmap->num) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
		goto done;
	}

	/* find the name of this database */
	if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
		goto done;
	}

	/* attach to it */
	ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
	if (ctdb_db == NULL) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
		goto done;
	}

	r = (struct ctdb_rec_data_old *)&recs->data[0];
	while (recs->count) {
		bool ok;

		ok = vacuum_fetch_process_one(ctdb_db, rec->ctdb->pnn, r);
		if (!ok) {
			break;
		}

		r = (struct ctdb_rec_data_old *)(r->length + (uint8_t *)r);
		recs->count--;
	}

done:
	talloc_free(tmp_ctx);
}

/*
 * handler for database detach
 */
static void detach_database_handler(uint64_t srvid, TDB_DATA data,
				    void *private_data)
{
	struct ctdb_recoverd *rec = talloc_get_type(
		private_data, struct ctdb_recoverd);
	struct ctdb_context *ctdb = rec->ctdb;
	uint32_t db_id;
	struct ctdb_db_context *ctdb_db;

	if (data.dsize != sizeof(db_id)) {
		return;
	}
	db_id = *(uint32_t *)data.dptr;

	ctdb_db = find_ctdb_db(ctdb, db_id);
	if (ctdb_db == NULL) {
		/* database is not attached */
		return;
	}

	DLIST_REMOVE(ctdb->db_list, ctdb_db);

	DEBUG(DEBUG_NOTICE, ("Detached from database '%s'\n",
			     ctdb_db->db_name));
	talloc_free(ctdb_db);
}
/*
  called when ctdb_wait_timeout should finish
 */
static void ctdb_wait_handler(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval yt, void *p)
{
	uint32_t *timed_out = (uint32_t *)p;
	(*timed_out) = 1;
}

/*
  wait for a given number of seconds
 */
static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
{
	uint32_t timed_out = 0;
	time_t usecs = (secs - (time_t)secs) * 1000000;
	tevent_add_timer(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
			 ctdb_wait_handler, &timed_out);
	while (!timed_out) {
		tevent_loop_once(ctdb->ev);
	}
}

/*
  called when an election times out (ends)
 */
static void ctdb_election_timeout(struct tevent_context *ev,
				  struct tevent_timer *te,
				  struct timeval t, void *p)
{
	struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
	rec->election_timeout = NULL;
	fast_start = false;

	DEBUG(DEBUG_WARNING,("Election period ended\n"));
}


/*
  wait for an election to finish. It finishes election_timeout seconds after
  the last election packet is received
 */
static void ctdb_wait_election(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	while (rec->election_timeout) {
		tevent_loop_once(ctdb->ev);
	}
}

/*
  Update our local flags from all remote connected nodes.
  This is only run when we are or we believe we are the recovery master
 */
static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap)
{
	int j;
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *mem_ctx = talloc_new(ctdb);

	/* get the nodemap for all active remote nodes and verify
	   they are the same as for this node
	 */
	for (j=0; j<nodemap->num; j++) {
		struct ctdb_node_map_old *remote_nodemap=NULL;
		int ret;

		if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
			continue;
		}
		if (nodemap->nodes[j].pnn == ctdb->pnn) {
			continue;
		}

		ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					   mem_ctx, &remote_nodemap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n",
					  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			talloc_free(mem_ctx);
			return MONITOR_FAILED;
		}
		if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
			/* We should tell our daemon about this so it
			   updates its flags or else we will log the same
			   message again in the next iteration of recovery.
			   Since we are the recovery master we can just as
			   well update the flags on all nodes.
			*/
			ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
			if (ret != 0) {
				DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
				talloc_free(mem_ctx);
				return MONITOR_FAILED;
			}

			/* Update our local copy of the flags in the recovery
			   daemon.
			*/
			DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
					    nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
					    nodemap->nodes[j].flags));
			nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
		}
		talloc_free(remote_nodemap);
	}
	talloc_free(mem_ctx);
	return MONITOR_OK;
}

/* Create a new random generation id.
   The generation id cannot be the INVALID_GENERATION id
*/
static uint32_t new_generation(void)
{
	uint32_t generation;

	while (1) {
		generation = random();

		if (generation != INVALID_GENERATION) {
			break;
		}
	}

	return generation;
}


/*
  create a temporary working database
 */
static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
{
	char *name;
	struct tdb_wrap *recdb;
	unsigned tdb_flags;

	/* open up the temporary recovery database */
	name = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
			       ctdb->db_directory_state,
			       ctdb->pnn);
	if (name == NULL) {
		return NULL;
	}
	unlink(name);

	tdb_flags = TDB_NOLOCK;
	if (ctdb->valgrinding) {
		tdb_flags |= TDB_NOMMAP;
	}
	tdb_flags |= (TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING);

	recdb = tdb_wrap_open(mem_ctx, name, ctdb->tunable.database_hash_size,
			      tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
	if (recdb == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", name));
	}

	talloc_free(name);

	return recdb;
}

/*
  a traverse function for pulling all relevant records from recdb
 */
struct recdb_data {
	struct ctdb_context *ctdb;
	struct ctdb_marshall_buffer *recdata;
	uint32_t len;
	uint32_t allocated_len;
	bool failed;
	bool persistent;
};

static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
{
	struct recdb_data *params = (struct recdb_data *)p;
	struct ctdb_rec_data_old *recdata;
	struct ctdb_ltdb_header *hdr;

	/*
	 * skip empty records - but NOT for persistent databases:
	 *
	 * The record-by-record mode of recovery deletes empty records.
	 * For persistent databases, this can lead to data corruption
	 * by deleting records that should be there:
	 *
	 * - Assume the cluster has been running for a while.
	 *
	 * - A record R in a persistent database D has been created and
	 *   deleted a couple of times, the last operation being deletion,
	 *   leaving an empty record with a high RSN, say 10.
	 *
	 * - Now a node N is turned off.
	 *
	 * - This leaves the local database copy of D on N with the empty
	 *   copy of R and RSN 10. On all other nodes, the recovery has deleted
	 *   the copy of record R.
	 *
	 * - Now the record is created again while node N is turned off.
	 *   This creates R with RSN = 1 on all nodes except for N.
	 *
	 * - Now node N is turned on again. The following recovery will choose
	 *   the older empty copy of R due to RSN 10 > RSN 1.
	 *
	 * ==> Hence the record is gone after the recovery.
	 *
	 * On databases like Samba's registry, this can damage the higher-level
	 * data structures built from the various tdb-level records.
	 */
	if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
		return 0;
	}

	/* update the dmaster field to point to us */
	hdr = (struct ctdb_ltdb_header *)data.dptr;
	if (!params->persistent) {
		hdr->dmaster = params->ctdb->pnn;
		hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
	}

	/* add the record to the blob ready to send to the nodes */
	recdata = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
	if (recdata == NULL) {
		params->failed = true;
		return -1;
	}
	if (params->len + recdata->length >= params->allocated_len) {
		params->allocated_len = recdata->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
		params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
	}
	if (params->recdata == NULL) {
		DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
				  recdata->length + params->len));
		params->failed = true;
		return -1;
	}
	params->recdata->count++;
	memcpy(params->len+(uint8_t *)params->recdata, recdata, recdata->length);
	params->len += recdata->length;
	talloc_free(recdata);

	return 0;
}

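/*
 * A sketch of the rule enforced at the top of traverse_recdb, written as
 * a hypothetical predicate (not used by the daemon): a record whose
 * payload is nothing but the ltdb header is a deletion tombstone.  It is
 * dropped for normal databases, but must be kept for persistent ones so
 * that its high RSN keeps protecting the deletion, as the scenario in
 * the comment above describes.
 */
static bool recdb_record_is_pushed(bool persistent, TDB_DATA data)
{
	if (persistent) {
		return true;
	}
	return data.dsize > sizeof(struct ctdb_ltdb_header);
}
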
/*
  push the recdb database out to all nodes
 */
static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
			       bool persistent,
			       struct tdb_wrap *recdb, struct ctdb_node_map_old *nodemap)
{
	struct recdb_data params;
	struct ctdb_marshall_buffer *recdata;
	TDB_DATA outdata;
	TALLOC_CTX *tmp_ctx;
	uint32_t *nodes;

	tmp_ctx = talloc_new(ctdb);
	CTDB_NO_MEMORY(ctdb, tmp_ctx);

	recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
	CTDB_NO_MEMORY(ctdb, recdata);

	recdata->db_id = dbid;

	params.ctdb = ctdb;
	params.recdata = recdata;
	params.len = offsetof(struct ctdb_marshall_buffer, data);
	params.allocated_len = params.len;
	params.failed = false;
	params.persistent = persistent;

	if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	if (params.failed) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
		talloc_free(params.recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	recdata = params.recdata;

	outdata.dptr = (void *)recdata;
	outdata.dsize = params.len;

	nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, outdata,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
		talloc_free(recdata);
		talloc_free(tmp_ctx);
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n",
			     dbid, recdata->count));

	talloc_free(recdata);
	talloc_free(tmp_ctx);

	return 0;
}

/*
  go through a full recovery on one database
 */
static int recover_database(struct ctdb_recoverd *rec,
			    TALLOC_CTX *mem_ctx,
			    uint32_t dbid,
			    bool persistent,
			    uint32_t pnn,
			    struct ctdb_node_map_old *nodemap,
			    uint32_t transaction_id)
{
	struct tdb_wrap *recdb;
	int ret;
	struct ctdb_context *ctdb = rec->ctdb;
	TDB_DATA data;
	struct ctdb_transdb w;
	uint32_t *nodes;

	recdb = create_recdb(ctdb, mem_ctx);
	if (recdb == NULL) {
		return -1;
	}

	/* pull all remote databases onto the recdb */
	ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
	if (ret != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
		return -1;
	}

	DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));

	/* wipe all the remote databases. This is safe as we are in a transaction */
	w.db_id = dbid;
	w.tid = transaction_id;

	data.dptr = (void *)&w;
	data.dsize = sizeof(w);

	nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
				      nodes, 0,
				      CONTROL_TIMEOUT(), false, data,
				      NULL, NULL,
				      NULL) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
		talloc_free(recdb);
		return -1;
	}

	/* push out the correct database. This sets the dmaster and skips
	   the empty records */
	ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
	if (ret != 0) {
		talloc_free(recdb);
		return -1;
	}

	/* all done with this database */
	talloc_free(recdb);

	return 0;
}

/* when we start a recovery, make sure all nodes use the same reclock file
   setting
*/
static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
{
	struct ctdb_context *ctdb = rec->ctdb;
	TALLOC_CTX *tmp_ctx = talloc_new(NULL);
	TDB_DATA data;
	uint32_t *nodes;

	if (ctdb->recovery_lock_file == NULL) {
		data.dptr = NULL;
		data.dsize = 0;
	} else {
		data.dsize = strlen(ctdb->recovery_lock_file) + 1;
		data.dptr = (uint8_t *)ctdb->recovery_lock_file;
	}

	nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
	if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
				      nodes, 0,
				      CONTROL_TIMEOUT(),
				      false, data,
				      NULL, NULL,
				      rec) != 0) {
		DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
		talloc_free(tmp_ctx);
		return -1;
	}

	talloc_free(tmp_ctx);
	return 0;
}

/*
 * this callback is called for every node that failed to execute
 * ctdb_takeover_run(); it sets a flag so the takeover run is retried.
 */
static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
{
	DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));

	if (callback_data != NULL) {
		struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);

		DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));

		ctdb_set_culprit(rec, node_pnn);
	}
}


static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
{
	struct ctdb_context *ctdb = rec->ctdb;
	int i;
	struct ctdb_banning_state *ban_state;

	*self_ban = false;
	for (i=0; i<ctdb->num_nodes; i++) {
		if (ctdb->nodes[i]->ban_state == NULL) {
			continue;
		}
		ban_state = (struct ctdb_banning_state *)ctdb->nodes[i]->ban_state;
		if (ban_state->count < 2*ctdb->num_nodes) {
			continue;
		}

		DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
				    ctdb->nodes[i]->pnn, ban_state->count,
				    ctdb->tunable.recovery_ban_period));
		ctdb_ban_node(rec, ctdb->nodes[i]->pnn, ctdb->tunable.recovery_ban_period);
		ban_state->count = 0;

		/* Banning ourself? */
		if (ctdb->nodes[i]->pnn == rec->ctdb->pnn) {
			*self_ban = true;
		}
	}
}

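/*
 * Worked example of the banning arithmetic, assuming a 4-node cluster:
 * the ban threshold above is 2 * num_nodes = 8 credits.  A generic
 * failure recorded via ctdb_set_culprit() adds 1 credit, while a freeze
 * or transaction-start failure adds nodemap->num = 4 credits (see
 * set_recmode_fail_callback and transaction_start_fail_callback), so two
 * such heavyweight failures within recovery_grace_period reach the
 * threshold and the node is banned for recovery_ban_period seconds.
 * Credits also age out: if a node behaves for longer than
 * recovery_grace_period, ctdb_set_culprit_count() resets its count to 0.
 */
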
static bool do_takeover_run(struct ctdb_recoverd *rec,
			    struct ctdb_node_map_old *nodemap,
			    bool banning_credits_on_fail)
{
	uint32_t *nodes = NULL;
	struct ctdb_disable_message dtr;
	TDB_DATA data;
	int i;
	uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
	int ret;
	bool ok;

	DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));

	if (ctdb_op_is_in_progress(rec->takeover_run)) {
		DEBUG(DEBUG_ERR, (__location__
				  " takeover run already in progress\n"));
		ok = false;
		goto done;
	}

	if (!ctdb_op_begin(rec->takeover_run)) {
		ok = false;
		goto done;
	}

	/* Disable IP checks (takeover runs, really) on other nodes
	 * while doing this takeover run.  This will stop those other
	 * nodes from triggering takeover runs when they think they
	 * should be hosting an IP but it isn't yet on an interface.
	 * Don't wait for replies since a failure here might cause some
	 * noise in the logs but will not actually cause a problem.
	 */
	ZERO_STRUCT(dtr);
	dtr.srvid = 0; /* No reply */
	dtr.pnn = -1;

	data.dptr = (uint8_t*)&dtr;
	data.dsize = sizeof(dtr);

	nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);

	/* Disable for 60 seconds.  This can be a tunable later if
	 * necessary.
	 */
	dtr.timeout = 60;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
		}
	}

	ret = ctdb_takeover_run(rec->ctdb, nodemap,
				rec->force_rebalance_nodes,
				takeover_fail_callback,
				banning_credits_on_fail ? rec : NULL);

	/* Reenable takeover runs and IP checks on other nodes */
	dtr.timeout = 0;
	for (i = 0; i < talloc_array_length(nodes); i++) {
		if (ctdb_client_send_message(rec->ctdb, nodes[i],
					     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					     data) != 0) {
			DEBUG(DEBUG_INFO,("Failed to re-enable takeover runs\n"));
		}
	}

	if (ret != 0) {
		DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
		ok = false;
		goto done;
	}

	ok = true;
	/* Takeover run was successful so clear force rebalance targets */
	if (rebalance_nodes == rec->force_rebalance_nodes) {
		TALLOC_FREE(rec->force_rebalance_nodes);
	} else {
		DEBUG(DEBUG_WARNING,
		      ("Rebalance target nodes changed during takeover run - not clearing\n"));
	}
done:
	rec->need_takeover_run = !ok;
	talloc_free(nodes);
	ctdb_op_end(rec->takeover_run);

	DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
	return ok;
}

struct recovery_helper_state {
	int fd[2];
	pid_t pid;
	int result;
	bool done;
};

static void ctdb_recovery_handler(struct tevent_context *ev,
				  struct tevent_fd *fde,
				  uint16_t flags, void *private_data)
{
	struct recovery_helper_state *state = talloc_get_type_abort(
		private_data, struct recovery_helper_state);
	int ret;

	ret = sys_read(state->fd[0], &state->result, sizeof(state->result));
	if (ret != sizeof(state->result)) {
		state->result = EPIPE;
	}

	state->done = true;
}


static int db_recovery_parallel(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx)
{
	static char prog[PATH_MAX+1] = "";
	const char **args;
	struct recovery_helper_state *state;
	struct tevent_fd *fde;
	int nargs, ret;

	if (!ctdb_set_helper("recovery_helper", prog, sizeof(prog),
			     "CTDB_RECOVERY_HELPER", CTDB_HELPER_BINDIR,
			     "ctdb_recovery_helper")) {
		ctdb_die(rec->ctdb, "Unable to set recovery helper\n");
	}

	state = talloc_zero(mem_ctx, struct recovery_helper_state);
	if (state == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		return -1;
	}

	state->pid = -1;
	/* initialise the fds so the fail path does not close stray
	   descriptors if pipe() fails */
	state->fd[0] = -1;
	state->fd[1] = -1;

	ret = pipe(state->fd);
	if (ret != 0) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create pipe for recovery helper\n"));
		goto fail;
	}

	set_close_on_exec(state->fd[0]);

	nargs = 4;
	args = talloc_array(state, const char *, nargs);
	if (args == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	args[0] = talloc_asprintf(args, "%d", state->fd[1]);
	args[1] = rec->ctdb->daemon.name;
	args[2] = talloc_asprintf(args, "%u", new_generation());
	args[3] = NULL;

	if (args[0] == NULL || args[2] == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " memory error\n"));
		goto fail;
	}

	setenv("CTDB_DBDIR_STATE", rec->ctdb->db_directory_state, 1);

	if (!ctdb_vfork_with_logging(state, rec->ctdb, "recovery", prog, nargs,
				     args, NULL, NULL, &state->pid)) {
		DEBUG(DEBUG_ERR,
		      ("Failed to create child for recovery helper\n"));
		goto fail;
	}

	close(state->fd[1]);
	state->fd[1] = -1;

	state->done = false;

	fde = tevent_add_fd(rec->ctdb->ev, rec->ctdb, state->fd[0],
			    TEVENT_FD_READ, ctdb_recovery_handler, state);
	if (fde == NULL) {
		goto fail;
	}
	tevent_fd_set_auto_close(fde);

	while (!state->done) {
		tevent_loop_once(rec->ctdb->ev);
	}

	close(state->fd[0]);
	state->fd[0] = -1;

	if (state->result != 0) {
		goto fail;
	}

	ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	talloc_free(state);
	return 0;

fail:
	if (state->fd[0] != -1) {
		close(state->fd[0]);
	}
	if (state->fd[1] != -1) {
		close(state->fd[1]);
	}
	if (state->pid != -1) {
		ctdb_kill(rec->ctdb, state->pid, SIGKILL);
	}
	talloc_free(state);
	return -1;
}

1835static int db_recovery_serial(struct ctdb_recoverd *rec, TALLOC_CTX *mem_ctx,
1836 uint32_t pnn, struct ctdb_node_map_old *nodemap,
1837 struct ctdb_vnn_map *vnnmap,
1838 struct ctdb_dbid_map_old *dbmap)
1839{
1840 struct ctdb_context *ctdb = rec->ctdb;
1841 uint32_t generation;
1842 TDB_DATA data;
1843 uint32_t *nodes;
1844 int ret, i, j;
1845
1846 /* set recovery mode to active on all nodes */
1847 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, true);
1848 if (ret != 0) {
1849 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
1850 return -1;
1851 }
1852
1853 /* execute the "startrecovery" event script on all nodes */
1854 ret = run_startrecovery_eventscript(rec, nodemap);
1855 if (ret!=0) {
1856 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
1857 return -1;
1858 }
1859
1860 /* pick a new generation number */
1861 generation = new_generation();
1862
1863 /* change the vnnmap on this node to use the new generation
1864 number but not on any other nodes.
1865 this guarantees that if we abort the recovery prematurely
1866 for some reason (a node stops responding?)
1867 that we can just return immediately and we will reenter
1868 recovery shortly again.
1869 I.e. we deliberately leave the cluster with an inconsistent
1870 generation id to allow us to abort recovery at any stage and
1871 just restart it from scratch.
1872 */
1873 vnnmap->generation = generation;
1874 ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
1875 if (ret != 0) {
1876 DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
1877 return -1;
1878 }
1879
1880 /* Database generations are updated when the transaction is commited to
1881 * the databases. So make sure to use the final generation as the
1882 * transaction id
1883 */
1884 generation = new_generation();
1885
1886 data.dptr = (void *)&generation;
1887 data.dsize = sizeof(uint32_t);
1888
1889 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
1890 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
1891 nodes, 0,
1892 CONTROL_TIMEOUT(), false, data,
1893 NULL,
1894 transaction_start_fail_callback,
1895 rec) != 0) {
1896 DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
1897 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
1898 nodes, 0,
1899 CONTROL_TIMEOUT(), false, tdb_null,
1900 NULL,
1901 NULL,
1902 NULL) != 0) {
1903 DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
1904 }
1905 return -1;
1906 }
1907
1908 DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
1909
1910 for (i=0;i<dbmap->num;i++) {
1911 ret = recover_database(rec, mem_ctx,
1912 dbmap->dbs[i].db_id,
1913 dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
1914 pnn, nodemap, generation);
1915 if (ret != 0) {
1916 DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].db_id));
1917 return -1;
1918 }
1919 }
1920
1921 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
1922
1923 /* commit all the changes */
1924 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
1925 nodes, 0,
1926 CONTROL_TIMEOUT(), false, data,
1927 NULL, NULL,
1928 NULL) != 0) {
1929 DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
1930 return -1;
1931 }
1932
1933 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
1934
1935 /* build a new vnn map with all the currently active and
1936 unbanned nodes */
1937 vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
1938 CTDB_NO_MEMORY(ctdb, vnnmap);
1939 vnnmap->generation = generation;
1940 vnnmap->size = 0;
1941 vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
1942 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1943 for (i=j=0;i<nodemap->num;i++) {
1944 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1945 continue;
1946 }
1947 if (!ctdb_node_has_capabilities(rec->caps,
1948 ctdb->nodes[i]->pnn,
1949 CTDB_CAP_LMASTER)) {
1950			/* this node cannot be an lmaster */
1951			DEBUG(DEBUG_DEBUG, ("Node %d can't be an lmaster, skipping it\n", i));
1952 continue;
1953 }
1954
1955 vnnmap->size++;
1956 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1957 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1958 vnnmap->map[j++] = nodemap->nodes[i].pnn;
1959
1960 }
1961 if (vnnmap->size == 0) {
1962 DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
1963 vnnmap->size++;
1964 vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
1965 CTDB_NO_MEMORY(ctdb, vnnmap->map);
1966 vnnmap->map[0] = pnn;
1967 }
1968
1969 /* update to the new vnnmap on all nodes */
1970 ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
1971 if (ret != 0) {
1972 DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
1973 return -1;
1974 }
1975
1976 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
1977
1978 /* disable recovery mode */
1979 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL, false);
1980 if (ret != 0) {
1981 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
1982 return -1;
1983 }
1984
1985 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
1986
1987 /* execute the "recovered" event script on all nodes */
1988 ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
1989 if (ret!=0) {
1990 DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
1991 return -1;
1992 }
1993
1994 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
1995
1996 return 0;
1997}
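
/* Illustrative sketch, not part of the original file: db_recovery_serial()
 * above picks one generation for the interim vnnmap and then a second,
 * final generation to use as the transaction id.  A generation id must
 * never collide with the reserved "invalid" marker; one plausible way to
 * pick one is shown below.  The constant and function names here are
 * hypothetical stand-ins, not the real helpers used by this file. */
#if 0
#include <stdint.h>
#include <stdlib.h>

#define EXAMPLE_INVALID_GENERATION 1

static uint32_t example_new_generation(void)
{
	uint32_t generation;

	/* retry until the value differs from the reserved marker */
	do {
		generation = (uint32_t)random();
	} while (generation == EXAMPLE_INVALID_GENERATION);

	return generation;
}
#endif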
1998
1999/*
2000 we are the recmaster, and recovery is needed - start a recovery run
2001 */
2002static int do_recovery(struct ctdb_recoverd *rec,
2003 TALLOC_CTX *mem_ctx, uint32_t pnn,
2004 struct ctdb_node_map_old *nodemap, struct ctdb_vnn_map *vnnmap)
2005{
2006 struct ctdb_context *ctdb = rec->ctdb;
2007 int i, ret;
2008 struct ctdb_dbid_map_old *dbmap;
2009 struct timeval start_time;
2010 bool self_ban;
2011 bool par_recovery;
2012
2013 DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
2014
2015 /* Check if the current node is still the recmaster. It's possible that
2016 * re-election has changed the recmaster.
2017 */
2018 if (pnn != rec->recmaster) {
2019 DEBUG(DEBUG_NOTICE,
2020 ("Recovery master changed to %u, aborting recovery\n",
2021 rec->recmaster));
2022 return -1;
2023 }
2024
2025 /* if recovery fails, force it again */
2026 rec->need_recovery = true;
2027
2028 if (!ctdb_op_begin(rec->recovery)) {
2029 return -1;
2030 }
2031
2032 if (rec->election_timeout) {
2033 /* an election is in progress */
2034 DEBUG(DEBUG_ERR, ("do_recovery called while election in progress - try again later\n"));
2035 goto fail;
2036 }
2037
2038 ban_misbehaving_nodes(rec, &self_ban);
2039 if (self_ban) {
2040 DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
2041 goto fail;
2042 }
2043
2044 if (ctdb->recovery_lock_file != NULL) {
2045 if (ctdb_recovery_have_lock(ctdb)) {
2046 DEBUG(DEBUG_NOTICE, ("Already holding recovery lock\n"));
2047 } else {
2048 start_time = timeval_current();
2049 DEBUG(DEBUG_NOTICE, ("Attempting to take recovery lock (%s)\n",
2050 ctdb->recovery_lock_file));
2051 if (!ctdb_recovery_lock(ctdb)) {
2052 if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
2053 /* If ctdb is trying first recovery, it's
2054					/* If ctdb is attempting its first recovery,
2055					 * it's possible that the current node does
2056					 * not yet know who the recmaster is.
2057 DEBUG(DEBUG_ERR, ("Unable to get recovery lock"
2058 " - retrying recovery\n"));
2059 goto fail;
2060 }
2061
2062 DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
2063				     "and banning ourselves for %u seconds\n",
2064 ctdb->tunable.recovery_ban_period));
2065 ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
2066 goto fail;
2067 }
2068 ctdb_ctrl_report_recd_lock_latency(ctdb,
2069 CONTROL_TIMEOUT(),
2070 timeval_elapsed(&start_time));
2071 DEBUG(DEBUG_NOTICE,
2072 ("Recovery lock taken successfully by recovery daemon\n"));
2073 }
2074 }
2075
2076 DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
2077
2078 /* get a list of all databases */
2079 ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
2080 if (ret != 0) {
2081		DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", pnn));
2082 goto fail;
2083 }
2084
2085 /* we do the db creation before we set the recovery mode, so the freeze happens
2086 on all databases we will be dealing with. */
2087
2088 /* verify that we have all the databases any other node has */
2089 ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
2090 if (ret != 0) {
2091 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
2092 goto fail;
2093 }
2094
2095 /* verify that all other nodes have all our databases */
2096 ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
2097 if (ret != 0) {
2098 DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
2099 goto fail;
2100 }
2101 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
2102
2103 /* update the database priority for all remote databases */
2104 ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
2105 if (ret != 0) {
2106 DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
2107 }
2108 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
2109
2110
2111 /* update all other nodes to use the same setting for reclock files
2112 as the local recovery master.
2113 */
2114 sync_recovery_lock_file_across_cluster(rec);
2115
2116 /* Retrieve capabilities from all connected nodes */
2117 ret = update_capabilities(rec, nodemap);
2118 if (ret!=0) {
2119 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
2120		goto fail;
2121 }
2122
2123 /*
2124 update all nodes to have the same flags that we have
2125 */
2126 for (i=0;i<nodemap->num;i++) {
2127 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2128 continue;
2129 }
2130
2131 ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
2132 if (ret != 0) {
2133 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2134				DEBUG(DEBUG_WARNING, (__location__ " Unable to update flags on inactive node %d\n", i));
2135			} else {
2136				DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
2137				goto fail;
2138 }
2139 }
2140 }
2141
2142 DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
2143
2144 /* Check if all participating nodes have parallel recovery capability */
2145 par_recovery = true;
2146 for (i=0; i<nodemap->num; i++) {
2147 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
2148 continue;
2149 }
2150
2151 if (!(rec->caps[i].capabilities &
2152 CTDB_CAP_PARALLEL_RECOVERY)) {
2153 par_recovery = false;
2154 break;
2155 }
2156 }
2157
2158 if (par_recovery) {
2159 ret = db_recovery_parallel(rec, mem_ctx);
2160 } else {
2161 ret = db_recovery_serial(rec, mem_ctx, pnn, nodemap, vnnmap,
2162 dbmap);
2163 }
2164
2165 if (ret != 0) {
2166 goto fail;
2167 }
2168
2169 do_takeover_run(rec, nodemap, false);
2170
2171 /* send a message to all clients telling them that the cluster
2172 has been reconfigured */
2173 ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
2174 CTDB_SRVID_RECONFIGURE, tdb_null);
2175 if (ret != 0) {
2176 DEBUG(DEBUG_ERR, (__location__ " Failed to send reconfigure message\n"));
2177 goto fail;
2178 }
2179
2180 DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
2181
2182 rec->need_recovery = false;
2183 ctdb_op_end(rec->recovery);
2184
2185 /* we managed to complete a full recovery, make sure to forgive
2186 any past sins by the nodes that could now participate in the
2187 recovery.
2188 */
2189 DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
2190 for (i=0;i<nodemap->num;i++) {
2191 struct ctdb_banning_state *ban_state;
2192
2193 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
2194 continue;
2195 }
2196
2197 ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
2198 if (ban_state == NULL) {
2199 continue;
2200 }
2201
2202 ban_state->count = 0;
2203 }
2204
2205 /* We just finished a recovery successfully.
2206 We now wait for rerecovery_timeout before we allow
2207 another recovery to take place.
2208 */
2209	DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be suppressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
2210 ctdb_op_disable(rec->recovery, ctdb->ev,
2211 ctdb->tunable.rerecovery_timeout);
2212 return 0;
2213
2214fail:
2215 ctdb_op_end(rec->recovery);
2216 return -1;
2217}
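
/* Illustrative sketch, not part of the original file: the ctdb_op_begin()
 * call at the top of do_recovery() must be balanced by ctdb_op_end() on
 * every exit path, which is why error paths jump to the fail: label.
 * The helper and example_do_work() below are hypothetical and only
 * demonstrate the begin/fail/end pattern. */
#if 0
static int example_op_scope(struct ctdb_op_state *op)
{
	if (!ctdb_op_begin(op)) {
		return -1;
	}

	if (example_do_work() != 0) {
		/* jump rather than return, to keep begin/end balanced */
		goto fail;
	}

	ctdb_op_end(op);
	return 0;

fail:
	ctdb_op_end(op);
	return -1;
}
#endif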
2218
2219
2220/*
2221 elections are won by first checking the number of connected nodes, then
2222 the priority time, then the pnn
2223 */
2224struct election_message {
2225 uint32_t num_connected;
2226 struct timeval priority_time;
2227 uint32_t pnn;
2228 uint32_t node_flags;
2229};
2230
2231/*
2232  form this node's election data
2233 */
2234static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
2235{
2236 int ret, i;
2237 struct ctdb_node_map_old *nodemap;
2238 struct ctdb_context *ctdb = rec->ctdb;
2239
2240 ZERO_STRUCTP(em);
2241
2242 em->pnn = rec->ctdb->pnn;
2243 em->priority_time = rec->priority_time;
2244
2245 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap);
2246 if (ret != 0) {
2247 DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
2248 return;
2249 }
2250
2251 rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
2252 em->node_flags = rec->node_flags;
2253
2254 for (i=0;i<nodemap->num;i++) {
2255 if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
2256 em->num_connected++;
2257 }
2258 }
2259
2260	/* we shouldn't try to win this election if we can't be a recmaster */
2261 if ((ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2262 em->num_connected = 0;
2263 em->priority_time = timeval_current();
2264 }
2265
2266 talloc_free(nodemap);
2267}
2268
2269/*
2270 see if the given election data wins
2271 */
2272static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
2273{
2274 struct election_message myem;
2275 int cmp = 0;
2276
2277 ctdb_election_data(rec, &myem);
2278
2279	/* we can't win if we don't have the recmaster capability */
2280 if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
2281 return false;
2282 }
2283
2284	/* we can't win if we are banned */
2285 if (rec->node_flags & NODE_FLAGS_BANNED) {
2286 return false;
2287 }
2288
2289	/* we can't win if we are stopped */
2290 if (rec->node_flags & NODE_FLAGS_STOPPED) {
2291 return false;
2292 }
2293
2294 /* we will automatically win if the other node is banned */
2295 if (em->node_flags & NODE_FLAGS_BANNED) {
2296 return true;
2297 }
2298
2299	/* we will automatically win if the other node is stopped */
2300 if (em->node_flags & NODE_FLAGS_STOPPED) {
2301 return true;
2302 }
2303
	/* try to use the most connected node, as documented by the
	 * header comment above: num_connected is compared first */
	if (cmp == 0) {
		cmp = (int)myem.num_connected - (int)em->num_connected;
	}

2304	/* then the longest running node */
2305 if (cmp == 0) {
2306 cmp = timeval_compare(&em->priority_time, &myem.priority_time);
2307 }
2308
2309 if (cmp == 0) {
2310 cmp = (int)myem.pnn - (int)em->pnn;
2311 }
2312
2313 return cmp > 0;
2314}
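
/* Illustrative sketch, not part of the original file: with the comparison
 * chain above, connectedness is decisive before uptime or PNN.  The two
 * messages below use made-up values. */
#if 0
static void example_election_order(void)
{
	struct election_message a = { .num_connected = 3, .pnn = 2 };
	struct election_message b = { .num_connected = 2, .pnn = 0 };

	/* Seen from node a, cmp = 3 - 2 = 1 > 0, so a wins the election
	 * outright; priority_time and pnn are never consulted. */
	(void)a;
	(void)b;
}
#endif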
2315
2316/*
2317 send out an election request
2318 */
2319static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
2320{
2321 int ret;
2322 TDB_DATA election_data;
2323 struct election_message emsg;
2324 uint64_t srvid;
2325 struct ctdb_context *ctdb = rec->ctdb;
2326
2327 srvid = CTDB_SRVID_ELECTION;
2328
2329 ctdb_election_data(rec, &emsg);
2330
2331 election_data.dsize = sizeof(struct election_message);
2332 election_data.dptr = (unsigned char *)&emsg;
2333
2334
2335 /* first we assume we will win the election and set
2336 recoverymaster to be ourself on the current node
2337 */
2338 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2339 CTDB_CURRENT_NODE, pnn);
2340 if (ret != 0) {
2341 DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2342 return -1;
2343 }
2344 rec->recmaster = pnn;
2345
2346 /* send an election message to all active nodes */
2347 DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
2348 return ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, srvid, election_data);
2349}
2350
2351/*
2352 we think we are winning the election - send a broadcast election request
2353 */
2354static void election_send_request(struct tevent_context *ev,
2355 struct tevent_timer *te,
2356 struct timeval t, void *p)
2357{
2358 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2359 int ret;
2360
2361 ret = send_election_request(rec, ctdb_get_pnn(rec->ctdb));
2362 if (ret != 0) {
2363 DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
2364 }
2365
2366 TALLOC_FREE(rec->send_election_te);
2367}
2368
2369/*
2370 handler for memory dumps
2371*/
2372static void mem_dump_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2373{
2374 struct ctdb_recoverd *rec = talloc_get_type(
2375 private_data, struct ctdb_recoverd);
2376 struct ctdb_context *ctdb = rec->ctdb;
2377 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2378 TDB_DATA *dump;
2379 int ret;
2380 struct ctdb_srvid_message *rd;
2381
2382 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2383 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2384 talloc_free(tmp_ctx);
2385 return;
2386 }
2387 rd = (struct ctdb_srvid_message *)data.dptr;
2388
2389 dump = talloc_zero(tmp_ctx, TDB_DATA);
2390 if (dump == NULL) {
2391 DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
2392 talloc_free(tmp_ctx);
2393 return;
2394 }
2395 ret = ctdb_dump_memory(ctdb, dump);
2396 if (ret != 0) {
2397 DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
2398 talloc_free(tmp_ctx);
2399 return;
2400 }
2401
2402	DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
2403
2404 ret = ctdb_client_send_message(ctdb, rd->pnn, rd->srvid, *dump);
2405 if (ret != 0) {
2406 DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
2407 talloc_free(tmp_ctx);
2408 return;
2409 }
2410
2411 talloc_free(tmp_ctx);
2412}
2413
2414/*
2415 handler for reload_nodes
2416*/
2417static void reload_nodes_handler(uint64_t srvid, TDB_DATA data,
2418 void *private_data)
2419{
2420 struct ctdb_recoverd *rec = talloc_get_type(
2421 private_data, struct ctdb_recoverd);
2422
2423 DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
2424
2425 ctdb_load_nodes_file(rec->ctdb);
2426}
2427
2428
2429static void ctdb_rebalance_timeout(struct tevent_context *ev,
2430 struct tevent_timer *te,
2431 struct timeval t, void *p)
2432{
2433 struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
2434
2435 if (rec->force_rebalance_nodes == NULL) {
2436 DEBUG(DEBUG_ERR,
2437 ("Rebalance timeout occurred - no nodes to rebalance\n"));
2438 return;
2439 }
2440
2441 DEBUG(DEBUG_NOTICE,
2442 ("Rebalance timeout occurred - trigger takeover run\n"));
2443 rec->need_takeover_run = true;
2444}
2445
2446
2447static void recd_node_rebalance_handler(uint64_t srvid, TDB_DATA data,
2448 void *private_data)
2449{
2450 struct ctdb_recoverd *rec = talloc_get_type(
2451 private_data, struct ctdb_recoverd);
2452 struct ctdb_context *ctdb = rec->ctdb;
2453 uint32_t pnn;
2454 uint32_t *t;
2455 int len;
2456 uint32_t deferred_rebalance;
2457
2458 if (rec->recmaster != ctdb_get_pnn(ctdb)) {
2459 return;
2460 }
2461
2462 if (data.dsize != sizeof(uint32_t)) {
2463 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(uint32_t)));
2464 return;
2465 }
2466
2467 pnn = *(uint32_t *)&data.dptr[0];
2468
2469 DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
2470
2471	/* Copy any existing list of nodes.  A realloc variant could
2472	 * probably do this, but freeing the old array must also cancel
2473	 * the timer event for the timeout, and it is not clear that
2474	 * realloc would preserve that behaviour.
2475	 */
2476 len = (rec->force_rebalance_nodes != NULL) ?
2477 talloc_array_length(rec->force_rebalance_nodes) :
2478 0;
2479
2480 /* This allows duplicates to be added but they don't cause
2481 * harm. A call to add a duplicate PNN arguably means that
2482 * the timeout should be reset, so this is the simplest
2483 * solution.
2484 */
2485 t = talloc_zero_array(rec, uint32_t, len+1);
2486 CTDB_NO_MEMORY_VOID(ctdb, t);
2487 if (len > 0) {
2488 memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
2489 }
2490 t[len] = pnn;
2491
2492 talloc_free(rec->force_rebalance_nodes);
2493
2494 rec->force_rebalance_nodes = t;
2495
2496	/* If configured, set up a deferred takeover run to make sure
2497 * that certain nodes get IPs rebalanced to them. This will
2498 * be cancelled if a successful takeover run happens before
2499 * the timeout. Assign tunable value to variable for
2500 * readability.
2501 */
2502 deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
2503 if (deferred_rebalance != 0) {
2504 tevent_add_timer(ctdb->ev, rec->force_rebalance_nodes,
2505 timeval_current_ofs(deferred_rebalance, 0),
2506 ctdb_rebalance_timeout, rec);
2507 }
2508}
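
/* Illustrative sketch, not part of the original file: the rebalance timer
 * above is allocated off rec->force_rebalance_nodes, so a talloc_free()
 * of that array (done after a successful takeover run) also destroys the
 * pending timer.  A minimal demonstration of the same talloc/tevent
 * idiom, with hypothetical names: */
#if 0
static void example_timeout_cb(struct tevent_context *ev,
			       struct tevent_timer *te,
			       struct timeval t, void *p)
{
	/* never fires if the owning context is freed first */
}

static void example_parented_timer(struct tevent_context *ev)
{
	TALLOC_CTX *owner = talloc_new(NULL);

	/* the timer becomes a talloc child of owner ... */
	tevent_add_timer(ev, owner, timeval_current_ofs(10, 0),
			 example_timeout_cb, NULL);

	/* ... so freeing owner cancels the timeout as a side effect */
	talloc_free(owner);
}
#endif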
2509
2510
2511
2512static void recd_update_ip_handler(uint64_t srvid, TDB_DATA data,
2513 void *private_data)
2514{
2515 struct ctdb_recoverd *rec = talloc_get_type(
2516 private_data, struct ctdb_recoverd);
2517 struct ctdb_public_ip *ip;
2518
2519 if (rec->recmaster != rec->ctdb->pnn) {
2520 DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
2521 return;
2522 }
2523
2524 if (data.dsize != sizeof(struct ctdb_public_ip)) {
2525 DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zd but expected %zd bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
2526 return;
2527 }
2528
2529 ip = (struct ctdb_public_ip *)data.dptr;
2530
2531 update_ip_assignment_tree(rec->ctdb, ip);
2532}
2533
2534static void srvid_disable_and_reply(struct ctdb_context *ctdb,
2535 TDB_DATA data,
2536 struct ctdb_op_state *op_state)
2537{
2538 struct ctdb_disable_message *r;
2539 uint32_t timeout;
2540 TDB_DATA result;
2541 int32_t ret = 0;
2542
2543 /* Validate input data */
2544 if (data.dsize != sizeof(struct ctdb_disable_message)) {
2545		DEBUG(DEBUG_ERR,(__location__ " Wrong size for data %lu, "
2546			 "expecting %lu\n", (long unsigned)data.dsize,
2547			 (long unsigned)sizeof(struct ctdb_disable_message)));
2548 return;
2549 }
2550 if (data.dptr == NULL) {
2551 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2552 return;
2553 }
2554
2555 r = (struct ctdb_disable_message *)data.dptr;
2556 timeout = r->timeout;
2557
2558 ret = ctdb_op_disable(op_state, ctdb->ev, timeout);
2559 if (ret != 0) {
2560 goto done;
2561 }
2562
2563 /* Returning our PNN tells the caller that we succeeded */
2564 ret = ctdb_get_pnn(ctdb);
2565done:
2566 result.dsize = sizeof(int32_t);
2567 result.dptr = (uint8_t *)&ret;
2568 srvid_request_reply(ctdb, (struct ctdb_srvid_message *)r, result);
2569}
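
/* Illustrative sketch, not part of the original file: the reply sent
 * above is a single int32_t, where a value >= 0 is the PNN of the node
 * that performed the disable and -1 signals failure.  A hypothetical
 * decoder on the receiving side (assumes the usual system headers
 * already pulled in by this file): */
#if 0
static bool example_decode_disable_reply(TDB_DATA result, uint32_t *pnn)
{
	int32_t ret;

	if (result.dsize != sizeof(int32_t) || result.dptr == NULL) {
		return false;
	}
	memcpy(&ret, result.dptr, sizeof(ret));
	if (ret < 0) {
		return false;
	}
	*pnn = (uint32_t)ret;
	return true;
}
#endif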
2570
2571static void disable_takeover_runs_handler(uint64_t srvid, TDB_DATA data,
2572 void *private_data)
2573{
2574 struct ctdb_recoverd *rec = talloc_get_type(
2575 private_data, struct ctdb_recoverd);
2576
2577 srvid_disable_and_reply(rec->ctdb, data, rec->takeover_run);
2578}
2579
2580/* Backward compatibility for this SRVID */
2581static void disable_ip_check_handler(uint64_t srvid, TDB_DATA data,
2582 void *private_data)
2583{
2584 struct ctdb_recoverd *rec = talloc_get_type(
2585 private_data, struct ctdb_recoverd);
2586 uint32_t timeout;
2587
2588 if (data.dsize != sizeof(uint32_t)) {
2589 DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
2590 "expecting %lu\n", (long unsigned)data.dsize,
2591 (long unsigned)sizeof(uint32_t)));
2592 return;
2593 }
2594 if (data.dptr == NULL) {
2595 DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
2596 return;
2597 }
2598
2599 timeout = *((uint32_t *)data.dptr);
2600
2601 ctdb_op_disable(rec->takeover_run, rec->ctdb->ev, timeout);
2602}
2603
2604static void disable_recoveries_handler(uint64_t srvid, TDB_DATA data,
2605 void *private_data)
2606{
2607 struct ctdb_recoverd *rec = talloc_get_type(
2608 private_data, struct ctdb_recoverd);
2609
2610 srvid_disable_and_reply(rec->ctdb, data, rec->recovery);
2611}
2612
2613/*
2614  handler for ip reallocate: just add the request to the list and
2615  handle it later in the monitor_cluster loop, so we do not recurse
2616  into takeover_run() from other requests
2617*/
2618static void ip_reallocate_handler(uint64_t srvid, TDB_DATA data,
2619 void *private_data)
2620{
2621 struct ctdb_srvid_message *request;
2622 struct ctdb_recoverd *rec = talloc_get_type(
2623 private_data, struct ctdb_recoverd);
2624
2625 if (data.dsize != sizeof(struct ctdb_srvid_message)) {
2626 DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
2627 return;
2628 }
2629
2630 request = (struct ctdb_srvid_message *)data.dptr;
2631
2632 srvid_request_add(rec->ctdb, &rec->reallocate_requests, request);
2633}
2634
2635static void process_ipreallocate_requests(struct ctdb_context *ctdb,
2636 struct ctdb_recoverd *rec)
2637{
2638 TDB_DATA result;
2639 int32_t ret;
2640 struct srvid_requests *current;
2641
2642 DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
2643
2644 /* Only process requests that are currently pending. More
2645 * might come in while the takeover run is in progress and
2646 * they will need to be processed later since they might
2647	 * be in response to flag changes.
2648 */
2649 current = rec->reallocate_requests;
2650 rec->reallocate_requests = NULL;
2651
2652 if (do_takeover_run(rec, rec->nodemap, false)) {
2653 ret = ctdb_get_pnn(ctdb);
2654 } else {
2655 ret = -1;
2656 }
2657
2658 result.dsize = sizeof(int32_t);
2659 result.dptr = (uint8_t *)&ret;
2660
2661 srvid_requests_reply(ctdb, &current, result);
2662}
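
/* Illustrative sketch, not part of the original file: the swap above
 * (grab the current list, reset the shared pointer, reply only to the
 * grabbed batch) is what keeps requests arriving mid-run queued for the
 * next pass.  A generic version of the same move, hypothetical name: */
#if 0
static struct srvid_requests *example_take_pending(
	struct srvid_requests **shared)
{
	struct srvid_requests *batch = *shared;

	/* new arrivals now queue onto a fresh (NULL) list */
	*shared = NULL;

	return batch;
}
#endif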
2663
2664/*
2665 * handler for assigning banning credits
2666 */
2667static void banning_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2668{
2669 struct ctdb_recoverd *rec = talloc_get_type(
2670 private_data, struct ctdb_recoverd);
2671 uint32_t ban_pnn;
2672
2673 /* Ignore if we are not recmaster */
2674 if (rec->ctdb->pnn != rec->recmaster) {
2675 return;
2676 }
2677
2678 if (data.dsize != sizeof(uint32_t)) {
2679		DEBUG(DEBUG_ERR, (__location__ " invalid data size %zu\n",
2680 data.dsize));
2681 return;
2682 }
2683
2684 ban_pnn = *(uint32_t *)data.dptr;
2685
2686 ctdb_set_culprit_count(rec, ban_pnn, rec->nodemap->num);
2687}
2688
2689/*
2690 handler for recovery master elections
2691*/
2692static void election_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2693{
2694 struct ctdb_recoverd *rec = talloc_get_type(
2695 private_data, struct ctdb_recoverd);
2696 struct ctdb_context *ctdb = rec->ctdb;
2697 int ret;
2698 struct election_message *em = (struct election_message *)data.dptr;
2699
2700 /* Ignore election packets from ourself */
2701 if (ctdb->pnn == em->pnn) {
2702 return;
2703 }
2704
2705 /* we got an election packet - update the timeout for the election */
2706 talloc_free(rec->election_timeout);
2707 rec->election_timeout = tevent_add_timer(
2708 ctdb->ev, ctdb,
2709 fast_start ?
2710 timeval_current_ofs(0, 500000) :
2711 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2712 ctdb_election_timeout, rec);
2713
2714 /* someone called an election. check their election data
2715 and if we disagree and we would rather be the elected node,
2716 send a new election message to all other nodes
2717 */
2718 if (ctdb_election_win(rec, em)) {
2719 if (!rec->send_election_te) {
2720 rec->send_election_te = tevent_add_timer(
2721 ctdb->ev, rec,
2722 timeval_current_ofs(0, 500000),
2723 election_send_request, rec);
2724 }
2725 return;
2726 }
2727
2728 /* we didn't win */
2729 TALLOC_FREE(rec->send_election_te);
2730
2731 /* Release the recovery lock file */
2732 if (ctdb_recovery_have_lock(ctdb)) {
2733 ctdb_recovery_unlock(ctdb);
2734 }
2735
2736 clear_ip_assignment_tree(ctdb);
2737
2738	/* ok, let that node become recmaster then */
2739 ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(),
2740 CTDB_CURRENT_NODE, em->pnn);
2741 if (ret != 0) {
2742		DEBUG(DEBUG_ERR, (__location__ " failed to set recmaster\n"));
2743 return;
2744 }
2745 rec->recmaster = em->pnn;
2746
2747 return;
2748}
2749
2750
2751/*
2752 force the start of the election process
2753 */
2754static void force_election(struct ctdb_recoverd *rec, uint32_t pnn,
2755 struct ctdb_node_map_old *nodemap)
2756{
2757 int ret;
2758 struct ctdb_context *ctdb = rec->ctdb;
2759
2760 DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
2761
2762 /* set all nodes to recovery mode to stop all internode traffic */
2763 ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE, false);
2764 if (ret != 0) {
2765 DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
2766 return;
2767 }
2768
2769 talloc_free(rec->election_timeout);
2770 rec->election_timeout = tevent_add_timer(
2771 ctdb->ev, ctdb,
2772 fast_start ?
2773 timeval_current_ofs(0, 500000) :
2774 timeval_current_ofs(ctdb->tunable.election_timeout, 0),
2775 ctdb_election_timeout, rec);
2776
2777 ret = send_election_request(rec, pnn);
2778 if (ret!=0) {
2779		DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
2780 return;
2781 }
2782
2783 /* wait for a few seconds to collect all responses */
2784 ctdb_wait_election(rec);
2785}
2786
2787
2788
2789/*
2790 handler for when a node changes its flags
2791*/
2792static void monitor_handler(uint64_t srvid, TDB_DATA data, void *private_data)
2793{
2794 struct ctdb_recoverd *rec = talloc_get_type(
2795 private_data, struct ctdb_recoverd);
2796 struct ctdb_context *ctdb = rec->ctdb;
2797 int ret;
2798 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2799 struct ctdb_node_map_old *nodemap=NULL;
2800 TALLOC_CTX *tmp_ctx;
2801 int i;
2802 int disabled_flag_changed;
2803
2804 if (data.dsize != sizeof(*c)) {
2805		DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change\n"));
2806 return;
2807 }
2808
2809 tmp_ctx = talloc_new(ctdb);
2810 CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
2811
2812 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
2813 if (ret != 0) {
2814		DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getnodemap failed in monitor_handler\n"));
2815 talloc_free(tmp_ctx);
2816 return;
2817 }
2818
2819
2820 for (i=0;i<nodemap->num;i++) {
2821 if (nodemap->nodes[i].pnn == c->pnn) break;
2822 }
2823
2824 if (i == nodemap->num) {
2825		DEBUG(DEBUG_CRIT,(__location__ " Flag change for non-existent node %u\n", c->pnn));
2826 talloc_free(tmp_ctx);
2827 return;
2828 }
2829
2830 if (c->old_flags != c->new_flags) {
2831 DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
2832 }
2833
2834 disabled_flag_changed = (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
2835
2836 nodemap->nodes[i].flags = c->new_flags;
2837
2838 ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(),
2839 CTDB_CURRENT_NODE, &ctdb->recovery_mode);
2840
2841 if (ret == 0 &&
2842 rec->recmaster == ctdb->pnn &&
2843 ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
2844		/* Only do the takeover run if the perm disabled or unhealthy
2845		   flags changed, since these cause an IP failover but not
2846		   a recovery.
2847		   If the node became disconnected or banned this will also
2848		   lead to an IP address failover, but that is handled
2849		   during recovery.
2850		*/
2851 if (disabled_flag_changed) {
2852 rec->need_takeover_run = true;
2853 }
2854 }
2855
2856 talloc_free(tmp_ctx);
2857}
2858
2859/*
2860  handler for when we need to push out flag changes to all other nodes
2861*/
2862static void push_flags_handler(uint64_t srvid, TDB_DATA data,
2863 void *private_data)
2864{
2865 struct ctdb_recoverd *rec = talloc_get_type(
2866 private_data, struct ctdb_recoverd);
2867 struct ctdb_context *ctdb = rec->ctdb;
2868 int ret;
2869 struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
2870 struct ctdb_node_map_old *nodemap=NULL;
2871 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
2872 uint32_t *nodes;
2873
2874 /* read the node flags from the recmaster */
2875 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
2876 tmp_ctx, &nodemap);
2877 if (ret != 0) {
2878		DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recmaster %u\n", rec->recmaster));
2879 talloc_free(tmp_ctx);
2880 return;
2881 }
2882 if (c->pnn >= nodemap->num) {
2883 DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %d\n", c->pnn));
2884 talloc_free(tmp_ctx);
2885 return;
2886 }
2887
2888 /* send the flags update to all connected nodes */
2889 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
2890
2891 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
2892 nodes, 0, CONTROL_TIMEOUT(),
2893 false, data,
2894 NULL, NULL,
2895 NULL) != 0) {
2896 DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
2897
2898 talloc_free(tmp_ctx);
2899 return;
2900 }
2901
2902 talloc_free(tmp_ctx);
2903}
2904
2905
2906struct verify_recmode_normal_data {
2907 uint32_t count;
2908 enum monitor_result status;
2909};
2910
2911static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
2912{
2913 struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
2914
2915
2916	/* one more node has responded with recmode data */
2917 rmdata->count--;
2918
2919 /* if we failed to get the recmode, then return an error and let
2920 the main loop try again.
2921 */
2922 if (state->state != CTDB_CONTROL_DONE) {
2923 if (rmdata->status == MONITOR_OK) {
2924 rmdata->status = MONITOR_FAILED;
2925 }
2926 return;
2927 }
2928
2929 /* if we got a response, then the recmode will be stored in the
2930 status field
2931 */
2932 if (state->status != CTDB_RECOVERY_NORMAL) {
2933 DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
2934 rmdata->status = MONITOR_RECOVERY_NEEDED;
2935 }
2936
2937 return;
2938}
2939
2940
2941/* verify that all nodes are in normal recovery mode */
2942static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap)
2943{
2944 struct verify_recmode_normal_data *rmdata;
2945 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
2946 struct ctdb_client_control_state *state;
2947 enum monitor_result status;
2948 int j;
2949
2950 rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
2951 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
2952 rmdata->count = 0;
2953 rmdata->status = MONITOR_OK;
2954
2955 /* loop over all active nodes and send an async getrecmode call to
2956	   them */
2957 for (j=0; j<nodemap->num; j++) {
2958 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
2959 continue;
2960 }
2961 state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx,
2962 CONTROL_TIMEOUT(),
2963 nodemap->nodes[j].pnn);
2964 if (state == NULL) {
2965 /* we failed to send the control, treat this as
2966 an error and try again next iteration
2967 */
2968 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
2969 talloc_free(mem_ctx);
2970 return MONITOR_FAILED;
2971 }
2972
2973 /* set up the callback functions */
2974 state->async.fn = verify_recmode_normal_callback;
2975 state->async.private_data = rmdata;
2976
2977 /* one more control to wait for to complete */
2978 rmdata->count++;
2979 }
2980
2981
2982 /* now wait for up to the maximum number of seconds allowed
2983	   or until all nodes we expect a response from have replied
2984 */
2985 while (rmdata->count > 0) {
2986 tevent_loop_once(ctdb->ev);
2987 }
2988
2989 status = rmdata->status;
2990 talloc_free(mem_ctx);
2991 return status;
2992}
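
/* Illustrative sketch, not part of the original file: verify_recmode()
 * above uses a fan-out pattern - send N async controls, let each
 * callback decrement a counter, then pump the event loop until the
 * counter reaches zero.  A stripped-down version with hypothetical
 * names: */
#if 0
struct example_fanout {
	uint32_t pending;
	bool failed;
};

static void example_fanout_done(struct example_fanout *state, bool ok)
{
	/* invoked once per completed request */
	state->pending--;
	if (!ok) {
		state->failed = true;
	}
}

static bool example_fanout_wait(struct tevent_context *ev,
				struct example_fanout *state)
{
	while (state->pending > 0) {
		tevent_loop_once(ev);
	}
	return !state->failed;
}
#endif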
2993
2994
2995struct verify_recmaster_data {
2996 struct ctdb_recoverd *rec;
2997 uint32_t count;
2998 uint32_t pnn;
2999 enum monitor_result status;
3000};
3001
3002static void verify_recmaster_callback(struct ctdb_client_control_state *state)
3003{
3004 struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
3005
3006
3007	/* one more node has responded with recmaster data */
3008 rmdata->count--;
3009
3010 /* if we failed to get the recmaster, then return an error and let
3011 the main loop try again.
3012 */
3013 if (state->state != CTDB_CONTROL_DONE) {
3014 if (rmdata->status == MONITOR_OK) {
3015 rmdata->status = MONITOR_FAILED;
3016 }
3017 return;
3018 }
3019
3020 /* if we got a response, then the recmaster will be stored in the
3021 status field
3022 */
3023 if (state->status != rmdata->pnn) {
3024 DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
3025 ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
3026 rmdata->status = MONITOR_ELECTION_NEEDED;
3027 }
3028
3029 return;
3030}
3031
3032
3033/* verify that all nodes agree that we are the recmaster */
3034static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map_old *nodemap, uint32_t pnn)
3035{
3036 struct ctdb_context *ctdb = rec->ctdb;
3037 struct verify_recmaster_data *rmdata;
3038 TALLOC_CTX *mem_ctx = talloc_new(ctdb);
3039 struct ctdb_client_control_state *state;
3040 enum monitor_result status;
3041 int j;
3042
3043 rmdata = talloc(mem_ctx, struct verify_recmaster_data);
3044 CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
3045 rmdata->rec = rec;
3046 rmdata->count = 0;
3047 rmdata->pnn = pnn;
3048 rmdata->status = MONITOR_OK;
3049
3050 /* loop over all active nodes and send an async getrecmaster call to
3051	   them */
3052 for (j=0; j<nodemap->num; j++) {
3053 if (nodemap->nodes[j].pnn == rec->recmaster) {
3054 continue;
3055 }
3056 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
3057 continue;
3058 }
3059 state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx,
3060 CONTROL_TIMEOUT(),
3061 nodemap->nodes[j].pnn);
3062 if (state == NULL) {
3063 /* we failed to send the control, treat this as
3064 an error and try again next iteration
3065 */
3066 DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
3067 talloc_free(mem_ctx);
3068 return MONITOR_FAILED;
3069 }
3070
3071 /* set up the callback functions */
3072 state->async.fn = verify_recmaster_callback;
3073 state->async.private_data = rmdata;
3074
3075 /* one more control to wait for to complete */
3076 rmdata->count++;
3077 }
3078
3079
3080 /* now wait for up to the maximum number of seconds allowed
3081	   or until all nodes we expect a response from have replied
3082 */
3083 while (rmdata->count > 0) {
3084 tevent_loop_once(ctdb->ev);
3085 }
3086
3087 status = rmdata->status;
3088 talloc_free(mem_ctx);
3089 return status;
3090}
3091
3092static bool interfaces_have_changed(struct ctdb_context *ctdb,
3093 struct ctdb_recoverd *rec)
3094{
3095 struct ctdb_iface_list_old *ifaces = NULL;
3096 TALLOC_CTX *mem_ctx;
3097 bool ret = false;
3098
3099 mem_ctx = talloc_new(NULL);
3100
3101 /* Read the interfaces from the local node */
3102 if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
3103 CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
3104 DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
3105 /* We could return an error. However, this will be
3106 * rare so we'll decide that the interfaces have
3107 * actually changed, just in case.
3108 */
3109 talloc_free(mem_ctx);
3110 return true;
3111 }
3112
3113 if (!rec->ifaces) {
3114 /* We haven't been here before so things have changed */
3115 DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
3116 ret = true;
3117 } else if (rec->ifaces->num != ifaces->num) {
3118 /* Number of interfaces has changed */
3119 DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
3120 rec->ifaces->num, ifaces->num));
3121 ret = true;
3122 } else {
3123 /* See if interface names or link states have changed */
3124 int i;
3125 for (i = 0; i < rec->ifaces->num; i++) {
3126 struct ctdb_iface * iface = &rec->ifaces->ifaces[i];
3127 if (strcmp(iface->name, ifaces->ifaces[i].name) != 0) {
3128 DEBUG(DEBUG_NOTICE,
3129 ("Interface in slot %d changed: %s => %s\n",
3130 i, iface->name, ifaces->ifaces[i].name));
3131 ret = true;
3132 break;
3133 }
3134 if (iface->link_state != ifaces->ifaces[i].link_state) {
3135 DEBUG(DEBUG_NOTICE,
3136 ("Interface %s changed state: %d => %d\n",
3137 iface->name, iface->link_state,
3138 ifaces->ifaces[i].link_state));
3139 ret = true;
3140 break;
3141 }
3142 }
3143 }
3144
3145 talloc_free(rec->ifaces);
3146 rec->ifaces = talloc_steal(rec, ifaces);
3147
3148 talloc_free(mem_ctx);
3149 return ret;
3150}
3151
3152/* called to check that the local allocation of public ip addresses is ok.
3153*/
3154static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map_old *nodemap)
3155{
3156 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3157 int ret, j;
3158 bool need_takeover_run = false;
3159
3160 if (interfaces_have_changed(ctdb, rec)) {
3161 DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
3162 "local node %u - force takeover run\n",
3163 pnn));
3164 need_takeover_run = true;
3165 }
3166
3167	/* verify that we have the IP addresses we should have
3168	   and don't have ones we shouldn't have.
3169	   If we find an inconsistency we set recmode to
3170	   active on the local node and wait for the recmaster
3171	   to do a full blown recovery.
3172	   Also, if the pnn is -1 and we are healthy and can host the IP,
3173	   we request an IP reallocation.
3174 */
3175 if (ctdb->tunable.disable_ip_failover == 0) {
3176 struct ctdb_public_ip_list_old *ips = NULL;
3177
3178 /* read the *available* IPs from the local node */
3179 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
3180 if (ret != 0) {
3181 DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
3182 talloc_free(mem_ctx);
3183 return -1;
3184 }
3185
3186 for (j=0; j<ips->num; j++) {
3187 if (ips->ips[j].pnn == -1 &&
3188 nodemap->nodes[pnn].flags == 0) {
3189 DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
3190 ctdb_addr_to_str(&ips->ips[j].addr)));
3191 need_takeover_run = true;
3192 }
3193 }
3194
3195 talloc_free(ips);
3196
3197 /* read the *known* IPs from the local node */
3198 ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
3199 if (ret != 0) {
3200 DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
3201 talloc_free(mem_ctx);
3202 return -1;
3203 }
3204
3205 for (j=0; j<ips->num; j++) {
3206 if (ips->ips[j].pnn == pnn) {
3207 if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
3208 DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
3209 ctdb_addr_to_str(&ips->ips[j].addr)));
3210 need_takeover_run = true;
3211 }
3212 } else {
3213 if (ctdb->do_checkpublicip &&
3214 ctdb_sys_have_ip(&ips->ips[j].addr)) {
3215
3216 DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n",
3217 ctdb_addr_to_str(&ips->ips[j].addr)));
3218
3219 if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
3220 DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
3221 }
3222 }
3223 }
3224 }
3225 }
3226
3227 if (need_takeover_run) {
3228 struct ctdb_srvid_message rd;
3229 TDB_DATA data;
3230
3231 DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
3232
3233 ZERO_STRUCT(rd);
3234 rd.pnn = ctdb->pnn;
3235 rd.srvid = 0;
3236 data.dptr = (uint8_t *)&rd;
3237 data.dsize = sizeof(rd);
3238
3239 ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
3240 if (ret != 0) {
3241			DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster %d\n", (int)rec->recmaster));
3242 }
3243 }
3244 talloc_free(mem_ctx);
3245 return 0;
3246}
3247
3248
3249static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
3250{
3251 struct ctdb_node_map_old **remote_nodemaps = callback_data;
3252
3253 if (node_pnn >= ctdb->num_nodes) {
3254 DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
3255 return;
3256 }
3257
3258 remote_nodemaps[node_pnn] = (struct ctdb_node_map_old *)talloc_steal(remote_nodemaps, outdata.dptr);
3259
3260}
3261
3262static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
3263 struct ctdb_node_map_old *nodemap,
3264 struct ctdb_node_map_old **remote_nodemaps)
3265{
3266 uint32_t *nodes;
3267
3268 nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
3269 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
3270 nodes, 0,
3271 CONTROL_TIMEOUT(), false, tdb_null,
3272 async_getnodemap_callback,
3273 NULL,
3274 remote_nodemaps) != 0) {
3275 DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
3276
3277 return -1;
3278 }
3279
3280 return 0;
3281}
3282
3283static int update_recovery_lock_file(struct ctdb_context *ctdb)
3284{
3285 TALLOC_CTX *tmp_ctx = talloc_new(NULL);
3286 const char *reclockfile;
3287
3288 if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
3289 DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
3290 talloc_free(tmp_ctx);
3291 return -1;
3292 }
3293
3294 if (reclockfile == NULL) {
3295 if (ctdb->recovery_lock_file != NULL) {
3296 DEBUG(DEBUG_NOTICE,("Recovery lock file disabled\n"));
3297 talloc_free(ctdb->recovery_lock_file);
3298 ctdb->recovery_lock_file = NULL;
3299 ctdb_recovery_unlock(ctdb);
3300 }
3301 talloc_free(tmp_ctx);
3302 return 0;
3303 }
3304
3305 if (ctdb->recovery_lock_file == NULL) {
3306 DEBUG(DEBUG_NOTICE,
3307 ("Recovery lock file enabled (%s)\n", reclockfile));
3308 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3309 ctdb_recovery_unlock(ctdb);
3310 talloc_free(tmp_ctx);
3311 return 0;
3312 }
3313
3314
3315 if (!strcmp(reclockfile, ctdb->recovery_lock_file)) {
3316 talloc_free(tmp_ctx);
3317 return 0;
3318 }
3319
3320 DEBUG(DEBUG_NOTICE,
3321 ("Recovery lock file changed (now %s)\n", reclockfile));
3322 talloc_free(ctdb->recovery_lock_file);
3323 ctdb->recovery_lock_file = talloc_strdup(ctdb, reclockfile);
3324 ctdb_recovery_unlock(ctdb);
3325
3326 talloc_free(tmp_ctx);
3327 return 0;
3328}
3329
3330static enum monitor_result validate_recovery_master(struct ctdb_recoverd *rec,
3331 TALLOC_CTX *mem_ctx)
3332{
3333 struct ctdb_context *ctdb = rec->ctdb;
3334 uint32_t pnn = ctdb_get_pnn(ctdb);
3335 struct ctdb_node_map_old *nodemap = rec->nodemap;
3336 struct ctdb_node_map_old *recmaster_nodemap = NULL;
3337 int ret;
3338
3339 /* When recovery daemon is started, recmaster is set to
3340 * "unknown" so it knows to start an election.
3341 */
3342 if (rec->recmaster == CTDB_UNKNOWN_PNN) {
3343 DEBUG(DEBUG_NOTICE,
3344 ("Initial recovery master set - forcing election\n"));
3345 return MONITOR_ELECTION_NEEDED;
3346 }
3347
3348 /*
3349 * If the current recmaster does not have CTDB_CAP_RECMASTER,
3350 * but we have, then force an election and try to become the new
3351 * recmaster.
3352 */
3353 if (!ctdb_node_has_capabilities(rec->caps,
3354 rec->recmaster,
3355 CTDB_CAP_RECMASTER) &&
3356 (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
3357 !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
3358 DEBUG(DEBUG_ERR,
3359 (" Current recmaster node %u does not have CAP_RECMASTER,"
3360 " but we (node %u) have - force an election\n",
3361 rec->recmaster, pnn));
3362 return MONITOR_ELECTION_NEEDED;
3363 }
3364
3365 /* Verify that the master node has not been deleted. This
3366 * should not happen because a node should always be shutdown
3367 * before being deleted, causing a new master to be elected
3368 * before now. However, if something strange has happened
3369 * then checking here will ensure we don't index beyond the
3370 * end of the nodemap array. */
3371 if (rec->recmaster >= nodemap->num) {
3372 DEBUG(DEBUG_ERR,
3373 ("Recmaster node %u has been deleted. Force election\n",
3374 rec->recmaster));
3375 return MONITOR_ELECTION_NEEDED;
3376 }
3377
3378 /* if recovery master is disconnected/deleted we must elect a new recmaster */
3379 if (nodemap->nodes[rec->recmaster].flags &
3380 (NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED)) {
3381 DEBUG(DEBUG_NOTICE,
3382 ("Recmaster node %u is disconnected/deleted. Force election\n",
3383 rec->recmaster));
3384 return MONITOR_ELECTION_NEEDED;
3385 }
3386
3387 /* get nodemap from the recovery master to check if it is inactive */
3388 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), rec->recmaster,
3389 mem_ctx, &recmaster_nodemap);
3390 if (ret != 0) {
3391 DEBUG(DEBUG_ERR,
3392 (__location__
3393 " Unable to get nodemap from recovery master %u\n",
3394 rec->recmaster));
3395 return MONITOR_FAILED;
3396 }
3397
3398
3399 if ((recmaster_nodemap->nodes[rec->recmaster].flags & NODE_FLAGS_INACTIVE) &&
3400 (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
3401 DEBUG(DEBUG_NOTICE,
3402 ("Recmaster node %u is inactive. Force election\n",
3403 rec->recmaster));
3404 /*
3405 * update our nodemap to carry the recmaster's notion of
3406 * its own flags, so that we don't keep freezing the
3407 * inactive recmaster node...
3408 */
3409 nodemap->nodes[rec->recmaster].flags =
3410 recmaster_nodemap->nodes[rec->recmaster].flags;
3411 return MONITOR_ELECTION_NEEDED;
3412 }
3413
3414 return MONITOR_OK;
3415}
3416
3417static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
3418 TALLOC_CTX *mem_ctx)
3419{
3420 uint32_t pnn;
3421 struct ctdb_node_map_old *nodemap=NULL;
3422 struct ctdb_node_map_old **remote_nodemaps=NULL;
3423 struct ctdb_vnn_map *vnnmap=NULL;
3424 struct ctdb_vnn_map *remote_vnnmap=NULL;
3425 uint32_t num_lmasters;
3426 int32_t debug_level;
3427 int i, j, ret;
3428 bool self_ban;
3429
3430
3431 /* verify that the main daemon is still running */
3432 if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
3433 DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
3434 exit(-1);
3435 }
3436
3437 /* ping the local daemon to tell it we are alive */
3438 ctdb_ctrl_recd_ping(ctdb);
3439
3440 if (rec->election_timeout) {
3441 /* an election is in progress */
3442 return;
3443 }
3444
3445 /* read the debug level from the parent and update locally */
3446 ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
3447 if (ret !=0) {
3448 DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
3449 return;
3450 }
3451 DEBUGLEVEL = debug_level;
3452
3453 /* get relevant tunables */
3454 ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
3455 if (ret != 0) {
3456 DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
3457 return;
3458 }
3459
3460 /* get runstate */
3461 ret = ctdb_ctrl_get_runstate(ctdb, CONTROL_TIMEOUT(),
3462 CTDB_CURRENT_NODE, &ctdb->runstate);
3463 if (ret != 0) {
3464 DEBUG(DEBUG_ERR, ("Failed to get runstate - retrying\n"));
3465 return;
3466 }
3467
3468 /* get the current recovery lock file from the server */
3469 if (update_recovery_lock_file(ctdb) != 0) {
3470 DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
3471 return;
3472 }
3473
3474 pnn = ctdb_get_pnn(ctdb);
3475
3476 /* get nodemap */
3477 TALLOC_FREE(rec->nodemap);
3478 ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
3479 if (ret != 0) {
3480 DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
3481 return;
3482 }
3483 nodemap = rec->nodemap;
3484
3485 /* remember our own node flags */
3486 rec->node_flags = nodemap->nodes[pnn].flags;
3487
3488 ban_misbehaving_nodes(rec, &self_ban);
3489 if (self_ban) {
3490 DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
3491 return;
3492 }
3493
3494 /* if the local daemon is STOPPED or BANNED, we verify that the databases are
3495 also frozen and that the recmode is set to active.
3496 */
3497 if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
3498 /* If this node has become inactive then we want to
3499 * reduce the chances of it taking over the recovery
3500 * master role when it becomes active again. This
3501 * helps to stabilise the recovery master role so that
3502 * it stays on the most stable node.
3503 */
3504 rec->priority_time = timeval_current();
3505
3506 ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
3507 if (ret != 0) {
3508 DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
3509 }
3510 if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
3511 DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
3512
3513 ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
3514 if (ret != 0) {
3515 DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
3516
3517 return;
3518 }
3519 }
3520 if (! rec->frozen_on_inactive) {
3521 ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(),
3522 CTDB_CURRENT_NODE);
3523 if (ret != 0) {
3524 DEBUG(DEBUG_ERR,
3525 (__location__ " Failed to freeze node "
3526 "in STOPPED or BANNED state\n"));
3527 return;
3528 }
3529
3530 rec->frozen_on_inactive = true;
3531 }
3532
3533 /* If this node is stopped or banned then it is not the recovery
3534 * master, so don't do anything. This prevents stopped or banned
3535 * node from starting election and sending unnecessary controls.
3536 */
3537 return;
3538 }
3539
3540 rec->frozen_on_inactive = false;
3541
3542 /* If we are not the recmaster then do some housekeeping */
3543 if (rec->recmaster != pnn) {
3544 /* Ignore any IP reallocate requests - only recmaster
3545 * processes them
3546 */
3547 TALLOC_FREE(rec->reallocate_requests);
3548 /* Clear any nodes that should be force rebalanced in
3549 * the next takeover run. If the recovery master role
3550 * has moved then we don't want to process these some
3551 * time in the future.
3552 */
3553 TALLOC_FREE(rec->force_rebalance_nodes);
3554 }
3555
3556 /* Retrieve capabilities from all connected nodes */
3557 ret = update_capabilities(rec, nodemap);
3558 if (ret != 0) {
3559 DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
3560 return;
3561 }
3562
3563 switch (validate_recovery_master(rec, mem_ctx)) {
3564 case MONITOR_RECOVERY_NEEDED:
3565		/* cannot happen */
3566 return;
3567 case MONITOR_ELECTION_NEEDED:
3568 force_election(rec, pnn, nodemap);
3569 return;
3570 case MONITOR_OK:
3571 break;
3572 case MONITOR_FAILED:
3573 return;
3574 }
3575
3576	/* verify that we have all the IP addresses we should have and
3577	 * don't have addresses we shouldn't have.
3578 */
3579 if (ctdb->tunable.disable_ip_failover == 0 &&
3580 !ctdb_op_is_disabled(rec->takeover_run)) {
3581 if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
3582 DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
3583 }
3584 }
3585
3586
3587 /* if we are not the recmaster then we do not need to check
3588 if recovery is needed
3589 */
3590 if (pnn != rec->recmaster) {
3591 return;
3592 }
3593
3594
3595 /* ensure our local copies of flags are right */
3596 ret = update_local_flags(rec, nodemap);
3597 if (ret == MONITOR_ELECTION_NEEDED) {
3598 DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
3599 force_election(rec, pnn, nodemap);
3600 return;
3601 }
3602 if (ret != MONITOR_OK) {
3603 DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
3604 return;
3605 }
3606
3607 if (ctdb->num_nodes != nodemap->num) {
3608 DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
3609 ctdb_load_nodes_file(ctdb);
3610 return;
3611 }
3612
3613 /* verify that all active nodes agree that we are the recmaster */
3614 switch (verify_recmaster(rec, nodemap, pnn)) {
3615 case MONITOR_RECOVERY_NEEDED:
3616		/* cannot happen */
3617 return;
3618 case MONITOR_ELECTION_NEEDED:
3619 force_election(rec, pnn, nodemap);
3620 return;
3621 case MONITOR_OK:
3622 break;
3623 case MONITOR_FAILED:
3624 return;
3625 }
3626
3627
3628 /* get the vnnmap */
3629 ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
3630 if (ret != 0) {
3631 DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
3632 return;
3633 }
3634
3635 if (rec->need_recovery) {
3636 /* a previous recovery didn't finish */
3637 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3638 return;
3639 }
3640
3641 /* verify that all active nodes are in normal mode
3642 and not in recovery mode
3643 */
3644 switch (verify_recmode(ctdb, nodemap)) {
3645 case MONITOR_RECOVERY_NEEDED:
3646 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3647 return;
3648 case MONITOR_FAILED:
3649 return;
3650 case MONITOR_ELECTION_NEEDED:
3651		/* cannot happen */
3652 case MONITOR_OK:
3653 break;
3654 }
3655
3656
3657 if (ctdb->recovery_lock_file != NULL) {
3658 /* We must already hold the recovery lock */
3659 if (!ctdb_recovery_have_lock(ctdb)) {
3660 DEBUG(DEBUG_ERR,("Failed recovery lock sanity check. Force a recovery\n"));
3661 ctdb_set_culprit(rec, ctdb->pnn);
3662 do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
3663 return;
3664 }
3665 }
3666
3667
3668	/* if there are takeover runs requested, perform them and notify the waiters */
3669 if (!ctdb_op_is_disabled(rec->takeover_run) &&
3670 rec->reallocate_requests) {
3671 process_ipreallocate_requests(ctdb, rec);
3672 }
3673
3674 /* If recoveries are disabled then there is no use doing any
3675 * nodemap or flags checks. Recoveries might be disabled due
3676 * to "reloadnodes", so doing these checks might cause an
3677 * unnecessary recovery. */
3678 if (ctdb_op_is_disabled(rec->recovery)) {
3679 return;
3680 }

	/* get the nodemap for all active remote nodes
	 */
	remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map_old *, nodemap->num);
	if (remote_nodemaps == NULL) {
		DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
		return;
	}
	for (i=0; i<nodemap->num; i++) {
		remote_nodemaps[i] = NULL;
	}
	if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
		DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
		return;
	}

	/* verify that all other nodes have the same nodemap as we have
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		if (remote_nodemaps[j] == NULL) {
			DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
			ctdb_set_culprit(rec, j);
			return;
		}

		/* if the nodes disagree on how many nodes there are
		   then this is a good reason to try recovery
		 */
		if (remote_nodemaps[j]->num != nodemap->num) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has a different node count (%u vs our %u)\n",
				  nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* if the nodes disagree on which nodes exist and are
		   active, then that is also a good reason to do recovery
		 */
		for (i=0; i<nodemap->num; i++) {
			if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has a different nodemap pnn for %d (%u vs %u)\n",
					  nodemap->nodes[j].pnn, i,
					  remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/*
	 * Update node flags obtained from each active node. This ensures we
	 * have up-to-date information for all the nodes.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
	}

	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}

		/* verify the flags are consistent
		 */
		for (i=0; i<nodemap->num; i++) {
			if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
				continue;
			}

			if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n",
				      nodemap->nodes[j].pnn,
				      nodemap->nodes[i].pnn,
				      remote_nodemaps[j]->nodes[i].flags,
				      nodemap->nodes[i].flags));
				if (i == j) {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				} else {
					DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
					update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
					ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
					do_recovery(rec, mem_ctx, pnn, nodemap,
						    vnnmap);
					return;
				}
			}
		}
	}

	/* count how many active nodes have the lmaster capability */
	num_lmasters = 0;
	for (i=0; i<nodemap->num; i++) {
		if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
			if (ctdb_node_has_capabilities(rec->caps,
						       ctdb->nodes[i]->pnn,
						       CTDB_CAP_LMASTER)) {
				num_lmasters++;
			}
		}
	}

	/* There must be the same number of lmasters in the vnn map as
	 * there are active nodes with the lmaster capability... or
	 * do a recovery.
	 */
	if (vnnmap->size != num_lmasters) {
		DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
			  vnnmap->size, num_lmasters));
		ctdb_set_culprit(rec, ctdb->pnn);
		do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
		return;
	}

	/* verify that all active nodes in the nodemap also exist in
	   the vnnmap.
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		for (i=0; i<vnnmap->size; i++) {
			if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
				break;
			}
		}
		if (i == vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but does not exist in the vnnmap\n",
				  nodemap->nodes[j].pnn));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}
	}

	/* verify that all other nodes have the same vnnmap
	   and are from the same generation
	 */
	for (j=0; j<nodemap->num; j++) {
		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
			continue;
		}
		if (nodemap->nodes[j].pnn == pnn) {
			continue;
		}

		ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn,
					  mem_ctx, &remote_vnnmap);
		if (ret != 0) {
			DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n",
				  nodemap->nodes[j].pnn));
			return;
		}

		/* verify the vnnmap generation is the same */
		if (vnnmap->generation != remote_vnnmap->generation) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap size is the same */
		if (vnnmap->size != remote_vnnmap->size) {
			DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n",
				  nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
			ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
			do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
			return;
		}

		/* verify the vnnmap is the same */
		for (i=0; i<vnnmap->size; i++) {
			if (remote_vnnmap->map[i] != vnnmap->map[i]) {
				DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n",
					  nodemap->nodes[j].pnn));
				ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
				do_recovery(rec, mem_ctx, pnn, nodemap,
					    vnnmap);
				return;
			}
		}
	}

	/* we might need to change who has what IP assigned */
	if (rec->need_takeover_run) {
		/* If a takeover run fails, the offending nodes are
		 * assigned ban culprit counts and the takeover is
		 * retried.  If takeover runs fail repeatedly, the
		 * node will eventually be banned.
		 */
		do_takeover_run(rec, nodemap, true);
	}
}

/*
  the main monitoring loop
 */
static void monitor_cluster(struct ctdb_context *ctdb)
{
	struct ctdb_recoverd *rec;

	DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));

	rec = talloc_zero(ctdb, struct ctdb_recoverd);
	CTDB_NO_MEMORY_FATAL(ctdb, rec);

	rec->ctdb = ctdb;
	rec->recmaster = CTDB_UNKNOWN_PNN;

	rec->takeover_run = ctdb_op_init(rec, "takeover runs");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->takeover_run);

	rec->recovery = ctdb_op_init(rec, "recoveries");
	CTDB_NO_MEMORY_FATAL(ctdb, rec->recovery);

	rec->priority_time = timeval_current();
	rec->frozen_on_inactive = false;

	/* register a message port for sending memory dumps */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP, mem_dump_handler, rec);

	/* when a node is assigned banning credits */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_BANNING,
					banning_handler, rec);

	/* register a message port for recovery elections */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_ELECTION, election_handler, rec);

	/* when nodes are disabled/enabled */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS, monitor_handler, rec);

	/* when we are asked to push out a flag change */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS, push_flags_handler, rec);

	/* register a message port for vacuum fetch */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH, vacuum_fetch_handler, rec);

	/* register a message port for reloadnodes */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES, reload_nodes_handler, rec);

	/* register a message port for performing a takeover run */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN, ip_reallocate_handler, rec);

	/* register a message port for disabling the ip check for a short while */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK, disable_ip_check_handler, rec);

	/* register a message port for updating the recovery daemon's node assignment for an ip */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP, recd_update_ip_handler, rec);

	/* register a message port for forcing a rebalance of a node at the
	   next reallocation */
	ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE, recd_node_rebalance_handler, rec);

	/* Register a message port for disabling takeover runs */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
					disable_takeover_runs_handler, rec);

	/* Register a message port for disabling recoveries */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DISABLE_RECOVERIES,
					disable_recoveries_handler, rec);

	/* register a message port for detaching a database */
	ctdb_client_set_message_handler(ctdb,
					CTDB_SRVID_DETACH_DATABASE,
					detach_database_handler, rec);

	for (;;) {
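		/* Run each iteration of the monitoring loop on its own
		 * temporary talloc context, so that everything allocated
		 * by main_loop() is released before we sleep.
		 */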
		TALLOC_CTX *mem_ctx = talloc_new(ctdb);
		struct timeval start;
		double elapsed;

		if (!mem_ctx) {
			DEBUG(DEBUG_CRIT,(__location__
					  " Failed to create temp context\n"));
			exit(-1);
		}

		start = timeval_current();
		main_loop(ctdb, rec, mem_ctx);
		talloc_free(mem_ctx);

		/* we only check for recovery once every RecoverInterval seconds */
		elapsed = timeval_elapsed(&start);
		if (elapsed < ctdb->tunable.recover_interval) {
			ctdb_wait_timeout(ctdb, ctdb->tunable.recover_interval
					  - elapsed);
		}
	}
}

/*
  event handler for when the main ctdbd dies
 */
static void ctdb_recoverd_parent(struct tevent_context *ev,
				 struct tevent_fd *fde,
				 uint16_t flags, void *private_data)
{
	DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
	_exit(1);
}

/*
  called regularly to verify that the recovery daemon is still running
 */
static void ctdb_check_recd(struct tevent_context *ev,
			    struct tevent_timer *te,
			    struct timeval t, void *p)
{
	struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);

	if (ctdb_kill(ctdb, ctdb->recoverd_pid, 0) != 0) {
		DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));

		tevent_add_timer(ctdb->ev, ctdb, timeval_zero(),
				 ctdb_restart_recd, ctdb);

		return;
	}

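	/* the recovery daemon is still alive; schedule the next
	 * liveness check in 30 seconds
	 */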
	tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
			 timeval_current_ofs(30, 0),
			 ctdb_check_recd, ctdb);
}

static void recd_sig_child_handler(struct tevent_context *ev,
				   struct tevent_signal *se, int signum,
				   int count, void *dont_care,
				   void *private_data)
{
	int status;
	pid_t pid = -1;

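	/* Reap all exited children without blocking: waitpid() with
	 * WNOHANG returns 0 once no more exited children remain, and
	 * -1 with errno ECHILD when there are no children at all.
	 */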
	while (pid != 0) {
		pid = waitpid(-1, &status, WNOHANG);
		if (pid == -1) {
			if (errno != ECHILD) {
				DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno), errno));
			}
			return;
		}
		if (pid > 0) {
			DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)pid));
		}
	}
}

/*
  startup the recovery daemon as a child of the main ctdb daemon
 */
int ctdb_start_recoverd(struct ctdb_context *ctdb)
{
	int fd[2];
	struct tevent_signal *se;
	struct tevent_fd *fde;

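	/* This pipe is used to detect the death of the main daemon:
	 * the parent keeps the write end open and the child watches
	 * the read end for EOF.
	 */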
	if (pipe(fd) != 0) {
		return -1;
	}

	ctdb->recoverd_pid = ctdb_fork(ctdb);
	if (ctdb->recoverd_pid == -1) {
		return -1;
	}

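	/* In the parent: keep the write end of the pipe open for the
	 * child and periodically verify that the recovery daemon is
	 * still alive.
	 */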
	if (ctdb->recoverd_pid != 0) {
		talloc_free(ctdb->recd_ctx);
		ctdb->recd_ctx = talloc_new(ctdb);
		CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);

		close(fd[0]);
		tevent_add_timer(ctdb->ev, ctdb->recd_ctx,
				 timeval_current_ofs(30, 0),
				 ctdb_check_recd, ctdb);
		return 0;
	}

	close(fd[1]);

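	/* re-seed the random generator in the child so it does not
	 * share the parent's sequence
	 */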
	srandom(getpid() ^ time(NULL));

	prctl_set_comment("ctdb_recoverd");
	if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
		DEBUG(DEBUG_CRIT, (__location__ " ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
		exit(1);
	}

	DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));

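	/* Watch the read end of the pipe: when the parent daemon dies
	 * the write end is closed, fd[0] becomes readable (EOF) and
	 * ctdb_recoverd_parent() exits the recovery daemon.
	 */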
	fde = tevent_add_fd(ctdb->ev, ctdb, fd[0], TEVENT_FD_READ,
			    ctdb_recoverd_parent, &fd[0]);
	tevent_fd_set_auto_close(fde);

	/* set up a handler to pick up sigchld */
	se = tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
			       recd_sig_child_handler, ctdb);
	if (se == NULL) {
		DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
		exit(1);
	}

	monitor_cluster(ctdb);

	DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
	return -1;
}

/*
  shutdown the recovery daemon
 */
void ctdb_stop_recoverd(struct ctdb_context *ctdb)
{
	if (ctdb->recoverd_pid == 0) {
		return;
	}

	DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
	ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);

	TALLOC_FREE(ctdb->recd_ctx);
	TALLOC_FREE(ctdb->recd_ping_count);
}

static void ctdb_restart_recd(struct tevent_context *ev,
			      struct tevent_timer *te,
			      struct timeval t, void *private_data)
{
	struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);

	DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
	ctdb_stop_recoverd(ctdb);
	ctdb_start_recoverd(ctdb);
}