source: vendor/current/ctdb/server/ctdb_takeover.c

Last change on this file was 989, checked in by Silvan Scherrer, 9 years ago

Samba Server: update vendor to version 4.4.7

File size: 98.5 KB
Line 
1/*
2 ctdb ip takeover code
3
4 Copyright (C) Ronnie Sahlberg 2007
5 Copyright (C) Andrew Tridgell 2007
6 Copyright (C) Martin Schwenke 2011
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, see <http://www.gnu.org/licenses/>.
20*/
21#include "replace.h"
22#include "system/network.h"
23#include "system/filesys.h"
24#include "system/time.h"
25#include "system/wait.h"
26
27#include <talloc.h>
28#include <tevent.h>
29
30#include "lib/util/dlinklist.h"
31#include "lib/util/debug.h"
32#include "lib/util/samba_util.h"
33#include "lib/util/util_process.h"
34
35#include "ctdb_private.h"
36#include "ctdb_client.h"
37
38#include "common/rb_tree.h"
39#include "common/reqid.h"
40#include "common/system.h"
41#include "common/common.h"
42#include "common/logging.h"
43
44#include "server/ipalloc.h"
45
/*
 * Timeout built from the takeover_timeout tunable; the expansion needs a
 * variable named "ctdb" in scope at the point of use.
 */
#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)

/*
 * Gratuitous ARP pacing used by ctdb_control_send_arp(): the interval is
 * passed as the seconds part of the re-arm delay, and the announcement is
 * torn down once its send count equals the repeat value.
 */
#define CTDB_ARP_INTERVAL 1
#define CTDB_ARP_REPEAT 3
50
/*
 * One local network interface that public addresses may be placed on.
 * Entries are kept on the ctdb->ifaces list.
 */
struct ctdb_interface {
	struct ctdb_interface *prev;	/* list linkage */
	struct ctdb_interface *next;	/* list linkage */
	const char *name;		/* interface name, compared with strcmp() */
	bool link_up;			/* set to true when the entry is created */
	uint32_t references;		/* number of VNNs currently assigned here */
};
57
58static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
59{
60 if (vnn->iface) {
61 return vnn->iface->name;
62 }
63
64 return "__none__";
65}
66
67static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
68{
69 struct ctdb_interface *i;
70
71 /* Verify that we don't have an entry for this ip yet */
72 for (i=ctdb->ifaces;i;i=i->next) {
73 if (strcmp(i->name, iface) == 0) {
74 return 0;
75 }
76 }
77
78 /* create a new structure for this interface */
79 i = talloc_zero(ctdb, struct ctdb_interface);
80 CTDB_NO_MEMORY_FATAL(ctdb, i);
81 i->name = talloc_strdup(i, iface);
82 CTDB_NO_MEMORY(ctdb, i->name);
83
84 i->link_up = true;
85
86 DLIST_ADD(ctdb->ifaces, i);
87
88 return 0;
89}
90
91static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
92 const char *name)
93{
94 int n;
95
96 for (n = 0; vnn->ifaces[n] != NULL; n++) {
97 if (strcmp(name, vnn->ifaces[n]) == 0) {
98 return true;
99 }
100 }
101
102 return false;
103}
104
105/* If any interfaces now have no possible IPs then delete them. This
106 * implementation is naive (i.e. simple) rather than clever
107 * (i.e. complex). Given that this is run on delip and that operation
108 * is rare, this doesn't need to be efficient - it needs to be
109 * foolproof. One alternative is reference counting, where the logic
110 * is distributed and can, therefore, be broken in multiple places.
111 * Another alternative is to build a red-black tree of interfaces that
112 * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
113 * once) and then walking ctdb->ifaces once and deleting those not in
114 * the tree. Let's go to one of those if the naive implementation
115 * causes problems... :-)
116 */
117static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
118 struct ctdb_vnn *vnn)
119{
120 struct ctdb_interface *i, *next;
121
122 /* For each interface, check if there's an IP using it. */
123 for (i = ctdb->ifaces; i != NULL; i = next) {
124 struct ctdb_vnn *tv;
125 bool found;
126 next = i->next;
127
128 /* Only consider interfaces named in the given VNN. */
129 if (!vnn_has_interface_with_name(vnn, i->name)) {
130 continue;
131 }
132
133 /* Is the "single IP" on this interface? */
134 if ((ctdb->single_ip_vnn != NULL) &&
135 (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
136 (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
137 /* Found, next interface please... */
138 continue;
139 }
140 /* Search for a vnn with this interface. */
141 found = false;
142 for (tv=ctdb->vnn; tv; tv=tv->next) {
143 if (vnn_has_interface_with_name(tv, i->name)) {
144 found = true;
145 break;
146 }
147 }
148
149 if (!found) {
150 /* None of the VNNs are using this interface. */
151 DLIST_REMOVE(ctdb->ifaces, i);
152 talloc_free(i);
153 }
154 }
155}
156
157
158static struct ctdb_interface *ctdb_find_iface(struct ctdb_context *ctdb,
159 const char *iface)
160{
161 struct ctdb_interface *i;
162
163 for (i=ctdb->ifaces;i;i=i->next) {
164 if (strcmp(i->name, iface) == 0) {
165 return i;
166 }
167 }
168
169 return NULL;
170}
171
172static struct ctdb_interface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
173 struct ctdb_vnn *vnn)
174{
175 int i;
176 struct ctdb_interface *cur = NULL;
177 struct ctdb_interface *best = NULL;
178
179 for (i=0; vnn->ifaces[i]; i++) {
180
181 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
182 if (cur == NULL) {
183 continue;
184 }
185
186 if (!cur->link_up) {
187 continue;
188 }
189
190 if (best == NULL) {
191 best = cur;
192 continue;
193 }
194
195 if (cur->references < best->references) {
196 best = cur;
197 continue;
198 }
199 }
200
201 return best;
202}
203
204static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
205 struct ctdb_vnn *vnn)
206{
207 struct ctdb_interface *best = NULL;
208
209 if (vnn->iface) {
210 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
211 "still assigned to iface '%s'\n",
212 ctdb_addr_to_str(&vnn->public_address),
213 ctdb_vnn_iface_string(vnn)));
214 return 0;
215 }
216
217 best = ctdb_vnn_best_iface(ctdb, vnn);
218 if (best == NULL) {
219 DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
220 "cannot assign to iface any iface\n",
221 ctdb_addr_to_str(&vnn->public_address)));
222 return -1;
223 }
224
225 vnn->iface = best;
226 best->references++;
227 vnn->pnn = ctdb->pnn;
228
229 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
230 "now assigned to iface '%s' refs[%d]\n",
231 ctdb_addr_to_str(&vnn->public_address),
232 ctdb_vnn_iface_string(vnn),
233 best->references));
234 return 0;
235}
236
237static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
238 struct ctdb_vnn *vnn)
239{
240 DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
241 "now unassigned (old iface '%s' refs[%d])\n",
242 ctdb_addr_to_str(&vnn->public_address),
243 ctdb_vnn_iface_string(vnn),
244 vnn->iface?vnn->iface->references:0));
245 if (vnn->iface) {
246 vnn->iface->references--;
247 }
248 vnn->iface = NULL;
249 if (vnn->pnn == ctdb->pnn) {
250 vnn->pnn = -1;
251 }
252}
253
254static bool ctdb_vnn_available(struct ctdb_context *ctdb,
255 struct ctdb_vnn *vnn)
256{
257 int i;
258
259 /* Nodes that are not RUNNING can not host IPs */
260 if (ctdb->runstate != CTDB_RUNSTATE_RUNNING) {
261 return false;
262 }
263
264 if (vnn->delete_pending) {
265 return false;
266 }
267
268 if (vnn->iface && vnn->iface->link_up) {
269 return true;
270 }
271
272 for (i=0; vnn->ifaces[i]; i++) {
273 struct ctdb_interface *cur;
274
275 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
276 if (cur == NULL) {
277 continue;
278 }
279
280 if (cur->link_up) {
281 return true;
282 }
283 }
284
285 return false;
286}
287
288struct ctdb_takeover_arp {
289 struct ctdb_context *ctdb;
290 uint32_t count;
291 ctdb_sock_addr addr;
292 struct ctdb_tcp_array *tcparray;
293 struct ctdb_vnn *vnn;
294};
295
296
297/*
298 lists of tcp endpoints
299 */
300struct ctdb_tcp_list {
301 struct ctdb_tcp_list *prev, *next;
302 struct ctdb_connection connection;
303};
304
305/*
306 list of clients to kill on IP release
307 */
308struct ctdb_client_ip {
309 struct ctdb_client_ip *prev, *next;
310 struct ctdb_context *ctdb;
311 ctdb_sock_addr addr;
312 uint32_t client_id;
313};
314
315
316/*
317 send a gratuitous arp
318 */
319static void ctdb_control_send_arp(struct tevent_context *ev,
320 struct tevent_timer *te,
321 struct timeval t, void *private_data)
322{
323 struct ctdb_takeover_arp *arp = talloc_get_type(private_data,
324 struct ctdb_takeover_arp);
325 int i, ret;
326 struct ctdb_tcp_array *tcparray;
327 const char *iface = ctdb_vnn_iface_string(arp->vnn);
328
329 ret = ctdb_sys_send_arp(&arp->addr, iface);
330 if (ret != 0) {
331 DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
332 iface, strerror(errno)));
333 }
334
335 tcparray = arp->tcparray;
336 if (tcparray) {
337 for (i=0;i<tcparray->num;i++) {
338 struct ctdb_connection *tcon;
339
340 tcon = &tcparray->connections[i];
341 DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
342 (unsigned)ntohs(tcon->dst.ip.sin_port),
343 ctdb_addr_to_str(&tcon->src),
344 (unsigned)ntohs(tcon->src.ip.sin_port)));
345 ret = ctdb_sys_send_tcp(
346 &tcon->src,
347 &tcon->dst,
348 0, 0, 0);
349 if (ret != 0) {
350 DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
351 ctdb_addr_to_str(&tcon->src)));
352 }
353 }
354 }
355
356 arp->count++;
357
358 if (arp->count == CTDB_ARP_REPEAT) {
359 talloc_free(arp);
360 return;
361 }
362
363 tevent_add_timer(arp->ctdb->ev, arp->vnn->takeover_ctx,
364 timeval_current_ofs(CTDB_ARP_INTERVAL, 100000),
365 ctdb_control_send_arp, arp);
366}
367
368static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
369 struct ctdb_vnn *vnn)
370{
371 struct ctdb_takeover_arp *arp;
372 struct ctdb_tcp_array *tcparray;
373
374 if (!vnn->takeover_ctx) {
375 vnn->takeover_ctx = talloc_new(vnn);
376 if (!vnn->takeover_ctx) {
377 return -1;
378 }
379 }
380
381 arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
382 if (!arp) {
383 return -1;
384 }
385
386 arp->ctdb = ctdb;
387 arp->addr = vnn->public_address;
388 arp->vnn = vnn;
389
390 tcparray = vnn->tcp_array;
391 if (tcparray) {
392 /* add all of the known tcp connections for this IP to the
393 list of tcp connections to send tickle acks for */
394 arp->tcparray = talloc_steal(arp, tcparray);
395
396 vnn->tcp_array = NULL;
397 vnn->tcp_update_needed = true;
398 }
399
400 tevent_add_timer(arp->ctdb->ev, vnn->takeover_ctx,
401 timeval_zero(), ctdb_control_send_arp, arp);
402
403 return 0;
404}
405
/* Context carried from ctdb_do_takeip() to ctdb_do_takeip_callback(). */
struct ctdb_do_takeip_state {
	/* control to answer when the event script has finished */
	struct ctdb_req_control_old *c;
	/* VNN being taken over */
	struct ctdb_vnn *vnn;
};
410
411/*
412 called when takeip event finishes
413 */
414static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
415 void *private_data)
416{
417 struct ctdb_do_takeip_state *state =
418 talloc_get_type(private_data, struct ctdb_do_takeip_state);
419 int32_t ret;
420 TDB_DATA data;
421
422 if (status != 0) {
423 struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
424
425 if (status == -ETIME) {
426 ctdb_ban_self(ctdb);
427 }
428 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
429 ctdb_addr_to_str(&state->vnn->public_address),
430 ctdb_vnn_iface_string(state->vnn)));
431 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
432
433 node->flags |= NODE_FLAGS_UNHEALTHY;
434 talloc_free(state);
435 return;
436 }
437
438 if (ctdb->do_checkpublicip) {
439
440 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
441 if (ret != 0) {
442 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
443 talloc_free(state);
444 return;
445 }
446
447 }
448
449 data.dptr = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
450 data.dsize = strlen((char *)data.dptr) + 1;
451 DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
452
453 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
454
455
456 /* the control succeeded */
457 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
458 talloc_free(state);
459 return;
460}
461
462static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
463{
464 state->vnn->update_in_flight = false;
465 return 0;
466}
467
468/*
469 take over an ip address
470 */
471static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
472 struct ctdb_req_control_old *c,
473 struct ctdb_vnn *vnn)
474{
475 int ret;
476 struct ctdb_do_takeip_state *state;
477
478 if (vnn->update_in_flight) {
479 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
480 "update for this IP already in flight\n",
481 ctdb_addr_to_str(&vnn->public_address),
482 vnn->public_netmask_bits));
483 return -1;
484 }
485
486 ret = ctdb_vnn_assign_iface(ctdb, vnn);
487 if (ret != 0) {
488 DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
489 "assign a usable interface\n",
490 ctdb_addr_to_str(&vnn->public_address),
491 vnn->public_netmask_bits));
492 return -1;
493 }
494
495 state = talloc(vnn, struct ctdb_do_takeip_state);
496 CTDB_NO_MEMORY(ctdb, state);
497
498 state->c = NULL;
499 state->vnn = vnn;
500
501 vnn->update_in_flight = true;
502 talloc_set_destructor(state, ctdb_takeip_destructor);
503
504 DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
505 ctdb_addr_to_str(&vnn->public_address),
506 vnn->public_netmask_bits,
507 ctdb_vnn_iface_string(vnn)));
508
509 ret = ctdb_event_script_callback(ctdb,
510 state,
511 ctdb_do_takeip_callback,
512 state,
513 CTDB_EVENT_TAKE_IP,
514 "%s %s %u",
515 ctdb_vnn_iface_string(vnn),
516 ctdb_addr_to_str(&vnn->public_address),
517 vnn->public_netmask_bits);
518
519 if (ret != 0) {
520 DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
521 ctdb_addr_to_str(&vnn->public_address),
522 ctdb_vnn_iface_string(vnn)));
523 talloc_free(state);
524 return -1;
525 }
526
527 state->c = talloc_steal(ctdb, c);
528 return 0;
529}
530
/* Context carried from ctdb_do_updateip() to ctdb_do_updateip_callback(). */
struct ctdb_do_updateip_state {
	/* control to answer when the event script has finished */
	struct ctdb_req_control_old *c;
	/* interface the address is being moved away from */
	struct ctdb_interface *old;
	/* VNN being moved */
	struct ctdb_vnn *vnn;
};
536
537/*
538 called when updateip event finishes
539 */
540static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
541 void *private_data)
542{
543 struct ctdb_do_updateip_state *state =
544 talloc_get_type(private_data, struct ctdb_do_updateip_state);
545 int32_t ret;
546
547 if (status != 0) {
548 if (status == -ETIME) {
549 ctdb_ban_self(ctdb);
550 }
551 DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
552 ctdb_addr_to_str(&state->vnn->public_address),
553 state->old->name,
554 ctdb_vnn_iface_string(state->vnn)));
555
556 /*
557 * All we can do is reset the old interface
558 * and let the next run fix it
559 */
560 ctdb_vnn_unassign_iface(ctdb, state->vnn);
561 state->vnn->iface = state->old;
562 state->vnn->iface->references++;
563
564 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
565 talloc_free(state);
566 return;
567 }
568
569 if (ctdb->do_checkpublicip) {
570
571 ret = ctdb_announce_vnn_iface(ctdb, state->vnn);
572 if (ret != 0) {
573 ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
574 talloc_free(state);
575 return;
576 }
577
578 }
579
580 /* the control succeeded */
581 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
582 talloc_free(state);
583 return;
584}
585
586static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
587{
588 state->vnn->update_in_flight = false;
589 return 0;
590}
591
592/*
593 update (move) an ip address
594 */
595static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
596 struct ctdb_req_control_old *c,
597 struct ctdb_vnn *vnn)
598{
599 int ret;
600 struct ctdb_do_updateip_state *state;
601 struct ctdb_interface *old = vnn->iface;
602 const char *new_name;
603
604 if (vnn->update_in_flight) {
605 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
606 "update for this IP already in flight\n",
607 ctdb_addr_to_str(&vnn->public_address),
608 vnn->public_netmask_bits));
609 return -1;
610 }
611
612 ctdb_vnn_unassign_iface(ctdb, vnn);
613 ret = ctdb_vnn_assign_iface(ctdb, vnn);
614 if (ret != 0) {
615 DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
616 "assin a usable interface (old iface '%s')\n",
617 ctdb_addr_to_str(&vnn->public_address),
618 vnn->public_netmask_bits,
619 old->name));
620 return -1;
621 }
622
623 new_name = ctdb_vnn_iface_string(vnn);
624 if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
625 /* A benign update from one interface onto itself.
626 * no need to run the eventscripts in this case, just return
627 * success.
628 */
629 ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
630 return 0;
631 }
632
633 state = talloc(vnn, struct ctdb_do_updateip_state);
634 CTDB_NO_MEMORY(ctdb, state);
635
636 state->c = NULL;
637 state->old = old;
638 state->vnn = vnn;
639
640 vnn->update_in_flight = true;
641 talloc_set_destructor(state, ctdb_updateip_destructor);
642
643 DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
644 "interface %s to %s\n",
645 ctdb_addr_to_str(&vnn->public_address),
646 vnn->public_netmask_bits,
647 old->name,
648 new_name));
649
650 ret = ctdb_event_script_callback(ctdb,
651 state,
652 ctdb_do_updateip_callback,
653 state,
654 CTDB_EVENT_UPDATE_IP,
655 "%s %s %s %u",
656 state->old->name,
657 new_name,
658 ctdb_addr_to_str(&vnn->public_address),
659 vnn->public_netmask_bits);
660 if (ret != 0) {
661 DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
662 ctdb_addr_to_str(&vnn->public_address),
663 old->name, new_name));
664 talloc_free(state);
665 return -1;
666 }
667
668 state->c = talloc_steal(ctdb, c);
669 return 0;
670}
671
672/*
673 Find the vnn of the node that has a public ip address
674 returns -1 if the address is not known as a public address
675 */
676static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
677{
678 struct ctdb_vnn *vnn;
679
680 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
681 if (ctdb_same_ip(&vnn->public_address, addr)) {
682 return vnn;
683 }
684 }
685
686 return NULL;
687}
688
689/*
690 take over an ip address
691 */
692int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
693 struct ctdb_req_control_old *c,
694 TDB_DATA indata,
695 bool *async_reply)
696{
697 int ret;
698 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
699 struct ctdb_vnn *vnn;
700 bool have_ip = false;
701 bool do_updateip = false;
702 bool do_takeip = false;
703 struct ctdb_interface *best_iface = NULL;
704
705 if (pip->pnn != ctdb->pnn) {
706 DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
707 "with pnn %d, but we're node %d\n",
708 ctdb_addr_to_str(&pip->addr),
709 pip->pnn, ctdb->pnn));
710 return -1;
711 }
712
713 /* update out vnn list */
714 vnn = find_public_ip_vnn(ctdb, &pip->addr);
715 if (vnn == NULL) {
716 DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
717 ctdb_addr_to_str(&pip->addr)));
718 return 0;
719 }
720
721 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
722 have_ip = ctdb_sys_have_ip(&pip->addr);
723 }
724 best_iface = ctdb_vnn_best_iface(ctdb, vnn);
725 if (best_iface == NULL) {
726 DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find"
727 "a usable interface (old %s, have_ip %d)\n",
728 ctdb_addr_to_str(&vnn->public_address),
729 vnn->public_netmask_bits,
730 ctdb_vnn_iface_string(vnn),
731 have_ip));
732 return -1;
733 }
734
735 if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
736 DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
737 have_ip = false;
738 }
739
740
741 if (vnn->iface == NULL && have_ip) {
742 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
743 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
744 ctdb_addr_to_str(&vnn->public_address)));
745 return 0;
746 }
747
748 if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
749 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
750 "and we have it on iface[%s], but it was assigned to node %d"
751 "and we are node %d, banning ourself\n",
752 ctdb_addr_to_str(&vnn->public_address),
753 ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
754 ctdb_ban_self(ctdb);
755 return -1;
756 }
757
758 if (vnn->pnn == -1 && have_ip) {
759 vnn->pnn = ctdb->pnn;
760 DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
761 "and we already have it on iface[%s], update local daemon\n",
762 ctdb_addr_to_str(&vnn->public_address),
763 ctdb_vnn_iface_string(vnn)));
764 return 0;
765 }
766
767 if (vnn->iface) {
768 if (vnn->iface != best_iface) {
769 if (!vnn->iface->link_up) {
770 do_updateip = true;
771 } else if (vnn->iface->references > (best_iface->references + 1)) {
772 /* only move when the rebalance gains something */
773 do_updateip = true;
774 }
775 }
776 }
777
778 if (!have_ip) {
779 if (do_updateip) {
780 ctdb_vnn_unassign_iface(ctdb, vnn);
781 do_updateip = false;
782 }
783 do_takeip = true;
784 }
785
786 if (do_takeip) {
787 ret = ctdb_do_takeip(ctdb, c, vnn);
788 if (ret != 0) {
789 return -1;
790 }
791 } else if (do_updateip) {
792 ret = ctdb_do_updateip(ctdb, c, vnn);
793 if (ret != 0) {
794 return -1;
795 }
796 } else {
797 /*
798 * The interface is up and the kernel known the ip
799 * => do nothing
800 */
801 DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
802 ctdb_addr_to_str(&pip->addr),
803 vnn->public_netmask_bits,
804 ctdb_vnn_iface_string(vnn)));
805 return 0;
806 }
807
808 /* tell ctdb_control.c that we will be replying asynchronously */
809 *async_reply = true;
810
811 return 0;
812}
813
814static void do_delete_ip(struct ctdb_context *ctdb, struct ctdb_vnn *vnn)
815{
816 DLIST_REMOVE(ctdb->vnn, vnn);
817 ctdb_vnn_unassign_iface(ctdb, vnn);
818 ctdb_remove_orphaned_ifaces(ctdb, vnn);
819 talloc_free(vnn);
820}
821
822static struct ctdb_vnn *release_ip_post(struct ctdb_context *ctdb,
823 struct ctdb_vnn *vnn,
824 ctdb_sock_addr *addr)
825{
826 TDB_DATA data;
827
828 /* Send a message to all clients of this node telling them
829 * that the cluster has been reconfigured and they should
830 * close any connections on this IP address
831 */
832 data.dptr = (uint8_t *)ctdb_addr_to_str(addr);
833 data.dsize = strlen((char *)data.dptr)+1;
834 DEBUG(DEBUG_INFO, ("Sending RELEASE_IP message for %s\n", data.dptr));
835 ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
836
837 ctdb_vnn_unassign_iface(ctdb, vnn);
838
839 /* Process the IP if it has been marked for deletion */
840 if (vnn->delete_pending) {
841 do_delete_ip(ctdb, vnn);
842 return NULL;
843 }
844
845 return vnn;
846}
847
848struct release_ip_callback_state {
849 struct ctdb_req_control_old *c;
850 ctdb_sock_addr *addr;
851 struct ctdb_vnn *vnn;
852 uint32_t target_pnn;
853};
854
855/*
856 called when releaseip event finishes
857 */
858static void release_ip_callback(struct ctdb_context *ctdb, int status,
859 void *private_data)
860{
861 struct release_ip_callback_state *state =
862 talloc_get_type(private_data, struct release_ip_callback_state);
863
864 if (status == -ETIME) {
865 ctdb_ban_self(ctdb);
866 }
867
868 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
869 if (ctdb_sys_have_ip(state->addr)) {
870 DEBUG(DEBUG_ERR,
871 ("IP %s still hosted during release IP callback, failing\n",
872 ctdb_addr_to_str(state->addr)));
873 ctdb_request_control_reply(ctdb, state->c,
874 NULL, -1, NULL);
875 talloc_free(state);
876 return;
877 }
878 }
879
880 state->vnn->pnn = state->target_pnn;
881 state->vnn = release_ip_post(ctdb, state->vnn, state->addr);
882
883 /* the control succeeded */
884 ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
885 talloc_free(state);
886}
887
888static int ctdb_releaseip_destructor(struct release_ip_callback_state *state)
889{
890 if (state->vnn != NULL) {
891 state->vnn->update_in_flight = false;
892 }
893 return 0;
894}
895
896/*
897 release an ip address
898 */
899int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
900 struct ctdb_req_control_old *c,
901 TDB_DATA indata,
902 bool *async_reply)
903{
904 int ret;
905 struct release_ip_callback_state *state;
906 struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
907 struct ctdb_vnn *vnn;
908 char *iface;
909
910 /* update our vnn list */
911 vnn = find_public_ip_vnn(ctdb, &pip->addr);
912 if (vnn == NULL) {
913 DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
914 ctdb_addr_to_str(&pip->addr)));
915 return 0;
916 }
917
918 /* stop any previous arps */
919 talloc_free(vnn->takeover_ctx);
920 vnn->takeover_ctx = NULL;
921
922 /* RELEASE_IP controls are sent to all nodes that should not
923 * be hosting a particular IP. This serves 2 purposes. The
924 * first is to help resolve any inconsistencies. If a node
925 * does unexpectly host an IP then it will be released. The
926 * 2nd is to use a "redundant release" to tell non-takeover
927 * nodes where an IP is moving to. This is how "ctdb ip" can
928 * report the (likely) location of an IP by only asking the
929 * local node. Redundant releases need to update the PNN but
930 * are otherwise ignored.
931 */
932 if (ctdb->tunable.disable_ip_failover == 0 && ctdb->do_checkpublicip) {
933 if (!ctdb_sys_have_ip(&pip->addr)) {
934 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
935 ctdb_addr_to_str(&pip->addr),
936 vnn->public_netmask_bits,
937 ctdb_vnn_iface_string(vnn)));
938 vnn->pnn = pip->pnn;
939 ctdb_vnn_unassign_iface(ctdb, vnn);
940 return 0;
941 }
942 } else {
943 if (vnn->iface == NULL) {
944 DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
945 ctdb_addr_to_str(&pip->addr),
946 vnn->public_netmask_bits));
947 vnn->pnn = pip->pnn;
948 return 0;
949 }
950 }
951
952 /* There is a potential race between take_ip and us because we
953 * update the VNN via a callback that run when the
954 * eventscripts have been run. Avoid the race by allowing one
955 * update to be in flight at a time.
956 */
957 if (vnn->update_in_flight) {
958 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
959 "update for this IP already in flight\n",
960 ctdb_addr_to_str(&vnn->public_address),
961 vnn->public_netmask_bits));
962 return -1;
963 }
964
965 iface = strdup(ctdb_vnn_iface_string(vnn));
966
967 DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s node:%d\n",
968 ctdb_addr_to_str(&pip->addr),
969 vnn->public_netmask_bits,
970 iface,
971 pip->pnn));
972
973 state = talloc(ctdb, struct release_ip_callback_state);
974 if (state == NULL) {
975 ctdb_set_error(ctdb, "Out of memory at %s:%d",
976 __FILE__, __LINE__);
977 free(iface);
978 return -1;
979 }
980
981 state->c = NULL;
982 state->addr = talloc(state, ctdb_sock_addr);
983 if (state->addr == NULL) {
984 ctdb_set_error(ctdb, "Out of memory at %s:%d",
985 __FILE__, __LINE__);
986 free(iface);
987 talloc_free(state);
988 return -1;
989 }
990 *state->addr = pip->addr;
991 state->target_pnn = pip->pnn;
992 state->vnn = vnn;
993
994 vnn->update_in_flight = true;
995 talloc_set_destructor(state, ctdb_releaseip_destructor);
996
997 ret = ctdb_event_script_callback(ctdb,
998 state, release_ip_callback, state,
999 CTDB_EVENT_RELEASE_IP,
1000 "%s %s %u",
1001 iface,
1002 ctdb_addr_to_str(&pip->addr),
1003 vnn->public_netmask_bits);
1004 free(iface);
1005 if (ret != 0) {
1006 DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
1007 ctdb_addr_to_str(&pip->addr),
1008 ctdb_vnn_iface_string(vnn)));
1009 talloc_free(state);
1010 return -1;
1011 }
1012
1013 /* tell the control that we will be reply asynchronously */
1014 *async_reply = true;
1015 state->c = talloc_steal(state, c);
1016 return 0;
1017}
1018
1019static int ctdb_add_public_address(struct ctdb_context *ctdb,
1020 ctdb_sock_addr *addr,
1021 unsigned mask, const char *ifaces,
1022 bool check_address)
1023{
1024 struct ctdb_vnn *vnn;
1025 uint32_t num = 0;
1026 char *tmp;
1027 const char *iface;
1028 int i;
1029 int ret;
1030
1031 tmp = strdup(ifaces);
1032 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1033 if (!ctdb_sys_check_iface_exists(iface)) {
1034 DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
1035 free(tmp);
1036 return -1;
1037 }
1038 }
1039 free(tmp);
1040
1041 /* Verify that we don't have an entry for this ip yet */
1042 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
1043 if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
1044 DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
1045 ctdb_addr_to_str(addr)));
1046 return -1;
1047 }
1048 }
1049
1050 /* create a new vnn structure for this ip address */
1051 vnn = talloc_zero(ctdb, struct ctdb_vnn);
1052 CTDB_NO_MEMORY_FATAL(ctdb, vnn);
1053 vnn->ifaces = talloc_array(vnn, const char *, num + 2);
1054 tmp = talloc_strdup(vnn, ifaces);
1055 CTDB_NO_MEMORY_FATAL(ctdb, tmp);
1056 for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
1057 vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
1058 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
1059 vnn->ifaces[num] = talloc_strdup(vnn, iface);
1060 CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
1061 num++;
1062 }
1063 talloc_free(tmp);
1064 vnn->ifaces[num] = NULL;
1065 vnn->public_address = *addr;
1066 vnn->public_netmask_bits = mask;
1067 vnn->pnn = -1;
1068 if (check_address) {
1069 if (ctdb_sys_have_ip(addr)) {
1070 DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
1071 vnn->pnn = ctdb->pnn;
1072 }
1073 }
1074
1075 for (i=0; vnn->ifaces[i]; i++) {
1076 ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
1077 if (ret != 0) {
1078 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1079 "for public_address[%s]\n",
1080 vnn->ifaces[i], ctdb_addr_to_str(addr)));
1081 talloc_free(vnn);
1082 return -1;
1083 }
1084 }
1085
1086 DLIST_ADD(ctdb->vnn, vnn);
1087
1088 return 0;
1089}
1090
1091/*
1092 setup the public address lists from a file
1093*/
1094int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
1095{
1096 char **lines;
1097 int nlines;
1098 int i;
1099
1100 lines = file_lines_load(ctdb->public_addresses_file, &nlines, 0, ctdb);
1101 if (lines == NULL) {
1102 ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
1103 return -1;
1104 }
1105 while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
1106 nlines--;
1107 }
1108
1109 for (i=0;i<nlines;i++) {
1110 unsigned mask;
1111 ctdb_sock_addr addr;
1112 const char *addrstr;
1113 const char *ifaces;
1114 char *tok, *line;
1115
1116 line = lines[i];
1117 while ((*line == ' ') || (*line == '\t')) {
1118 line++;
1119 }
1120 if (*line == '#') {
1121 continue;
1122 }
1123 if (strcmp(line, "") == 0) {
1124 continue;
1125 }
1126 tok = strtok(line, " \t");
1127 addrstr = tok;
1128 tok = strtok(NULL, " \t");
1129 if (tok == NULL) {
1130 if (NULL == ctdb->default_public_interface) {
1131 DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
1132 i+1));
1133 talloc_free(lines);
1134 return -1;
1135 }
1136 ifaces = ctdb->default_public_interface;
1137 } else {
1138 ifaces = tok;
1139 }
1140
1141 if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
1142 DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
1143 talloc_free(lines);
1144 return -1;
1145 }
1146 if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
1147 DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
1148 talloc_free(lines);
1149 return -1;
1150 }
1151 }
1152
1153
1154 talloc_free(lines);
1155 return 0;
1156}
1157
1158int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
1159 const char *iface,
1160 const char *ip)
1161{
1162 struct ctdb_vnn *svnn;
1163 struct ctdb_interface *cur = NULL;
1164 bool ok;
1165 int ret;
1166
1167 svnn = talloc_zero(ctdb, struct ctdb_vnn);
1168 CTDB_NO_MEMORY(ctdb, svnn);
1169
1170 svnn->ifaces = talloc_array(svnn, const char *, 2);
1171 CTDB_NO_MEMORY(ctdb, svnn->ifaces);
1172 svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
1173 CTDB_NO_MEMORY(ctdb, svnn->ifaces[0]);
1174 svnn->ifaces[1] = NULL;
1175
1176 ok = parse_ip(ip, iface, 0, &svnn->public_address);
1177 if (!ok) {
1178 talloc_free(svnn);
1179 return -1;
1180 }
1181
1182 ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
1183 if (ret != 0) {
1184 DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
1185 "for single_ip[%s]\n",
1186 svnn->ifaces[0],
1187 ctdb_addr_to_str(&svnn->public_address)));
1188 talloc_free(svnn);
1189 return -1;
1190 }
1191
1192 /* assume the single public ip interface is initially "good" */
1193 cur = ctdb_find_iface(ctdb, iface);
1194 if (cur == NULL) {
1195 DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
1196 return -1;
1197 }
1198 cur->link_up = true;
1199
1200 ret = ctdb_vnn_assign_iface(ctdb, svnn);
1201 if (ret != 0) {
1202 talloc_free(svnn);
1203 return -1;
1204 }
1205
1206 ctdb->single_ip_vnn = svnn;
1207 return 0;
1208}
1209
1210static void *add_ip_callback(void *parm, void *data)
1211{
1212 struct public_ip_list *this_ip = parm;
1213 struct public_ip_list *prev_ip = data;
1214
1215 if (prev_ip == NULL) {
1216 return parm;
1217 }
1218 if (this_ip->pnn == -1) {
1219 this_ip->pnn = prev_ip->pnn;
1220 }
1221
1222 return parm;
1223}
1224
1225static int getips_count_callback(void *param, void *data)
1226{
1227 struct public_ip_list **ip_list = (struct public_ip_list **)param;
1228 struct public_ip_list *new_ip = (struct public_ip_list *)data;
1229
1230 new_ip->next = *ip_list;
1231 *ip_list = new_ip;
1232 return 0;
1233}
1234
/* Forward declaration: called by ctdb_reload_remote_public_ips() below
 * with the list of known public IPs fetched from node pnn. */
static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
				       struct ctdb_public_ip_list_old *ips,
				       uint32_t pnn);
1238
1239static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
1240 struct ipalloc_state *ipalloc_state,
1241 struct ctdb_node_map_old *nodemap)
1242{
1243 int j;
1244 int ret;
1245
1246 if (ipalloc_state->num != nodemap->num) {
1247 DEBUG(DEBUG_ERR,
1248 (__location__
1249 " ipalloc_state->num (%d) != nodemap->num (%d) invalid param\n",
1250 ipalloc_state->num, nodemap->num));
1251 return -1;
1252 }
1253
1254 for (j=0; j<nodemap->num; j++) {
1255 if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
1256 continue;
1257 }
1258
1259 /* Retrieve the list of known public IPs from the node */
1260 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1261 TAKEOVER_TIMEOUT(),
1262 j,
1263 ipalloc_state->known_public_ips,
1264 0,
1265 &ipalloc_state->known_public_ips[j]);
1266 if (ret != 0) {
1267 DEBUG(DEBUG_ERR,
1268 ("Failed to read known public IPs from node: %u\n",
1269 j));
1270 return -1;
1271 }
1272
1273 if (ctdb->do_checkpublicip) {
1274 verify_remote_ip_allocation(ctdb,
1275 ipalloc_state->known_public_ips[j],
1276 j);
1277 }
1278
1279 /* Retrieve the list of available public IPs from the node */
1280 ret = ctdb_ctrl_get_public_ips_flags(ctdb,
1281 TAKEOVER_TIMEOUT(),
1282 j,
1283 ipalloc_state->available_public_ips,
1284 CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
1285 &ipalloc_state->available_public_ips[j]);
1286 if (ret != 0) {
1287 DEBUG(DEBUG_ERR,
1288 ("Failed to read available public IPs from node: %u\n",
1289 j));
1290 return -1;
1291 }
1292 }
1293
1294 return 0;
1295}
1296
1297static struct public_ip_list *
1298create_merged_ip_list(struct ctdb_context *ctdb, struct ipalloc_state *ipalloc_state)
1299{
1300 int i, j;
1301 struct public_ip_list *ip_list;
1302 struct ctdb_public_ip_list_old *public_ips;
1303
1304 TALLOC_FREE(ctdb->ip_tree);
1305 ctdb->ip_tree = trbt_create(ctdb, 0);
1306
1307 for (i=0; i < ctdb->num_nodes; i++) {
1308 public_ips = ipalloc_state->known_public_ips[i];
1309
1310 if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
1311 continue;
1312 }
1313
1314 /* there were no public ips for this node */
1315 if (public_ips == NULL) {
1316 continue;
1317 }
1318
1319 for (j=0; j < public_ips->num; j++) {
1320 struct public_ip_list *tmp_ip;
1321
1322 tmp_ip = talloc_zero(ctdb->ip_tree, struct public_ip_list);
1323 CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
1324 /* Do not use information about IP addresses hosted
1325 * on other nodes, it may not be accurate */
1326 if (public_ips->ips[j].pnn == ctdb->nodes[i]->pnn) {
1327 tmp_ip->pnn = public_ips->ips[j].pnn;
1328 } else {
1329 tmp_ip->pnn = -1;
1330 }
1331 tmp_ip->addr = public_ips->ips[j].addr;
1332 tmp_ip->next = NULL;
1333
1334 trbt_insertarray32_callback(ctdb->ip_tree,
1335 IP_KEYLEN, ip_key(&public_ips->ips[j].addr),
1336 add_ip_callback,
1337 tmp_ip);
1338 }
1339 }
1340
1341 ip_list = NULL;
1342 trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &ip_list);
1343
1344 return ip_list;
1345}
1346
1347static bool all_nodes_are_disabled(struct ctdb_node_map_old *nodemap)
1348{
1349 int i;
1350
1351 for (i=0;i<nodemap->num;i++) {
1352 if (!(nodemap->nodes[i].flags & (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED))) {
1353 /* Found one completely healthy node */
1354 return false;
1355 }
1356 }
1357
1358 return true;
1359}
1360
/* State shared by get_tunable_callback() and get_tunable_fail_callback()
 * while a tunable is being fetched from the nodes. */
struct get_tunable_callback_data {
	/* Name of the tunable, used in log messages */
	const char *tunable;
	/* talloc array of results, indexed by the replying node's pnn */
	uint32_t *out;
	/* Set to true by the callbacks when the fetch must be treated as
	 * failed */
	bool fatal;
};
1366
1367static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
1368 int32_t res, TDB_DATA outdata,
1369 void *callback)
1370{
1371 struct get_tunable_callback_data *cd =
1372 (struct get_tunable_callback_data *)callback;
1373 int size;
1374
1375 if (res != 0) {
1376 /* Already handled in fail callback */
1377 return;
1378 }
1379
1380 if (outdata.dsize != sizeof(uint32_t)) {
1381 DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
1382 cd->tunable, pnn, (int)sizeof(uint32_t),
1383 (int)outdata.dsize));
1384 cd->fatal = true;
1385 return;
1386 }
1387
1388 size = talloc_array_length(cd->out);
1389 if (pnn >= size) {
1390 DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
1391 cd->tunable, pnn, size));
1392 return;
1393 }
1394
1395
1396 cd->out[pnn] = *(uint32_t *)outdata.dptr;
1397}
1398
1399static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1400 int32_t res, TDB_DATA outdata,
1401 void *callback)
1402{
1403 struct get_tunable_callback_data *cd =
1404 (struct get_tunable_callback_data *)callback;
1405
1406 switch (res) {
1407 case -ETIME:
1408 DEBUG(DEBUG_ERR,
1409 ("Timed out getting tunable \"%s\" from node %d\n",
1410 cd->tunable, pnn));
1411 cd->fatal = true;
1412 break;
1413 case -EINVAL:
1414 case -1:
1415 DEBUG(DEBUG_WARNING,
1416 ("Tunable \"%s\" not implemented on node %d\n",
1417 cd->tunable, pnn));
1418 break;
1419 default:
1420 DEBUG(DEBUG_ERR,
1421 ("Unexpected error getting tunable \"%s\" from node %d\n",
1422 cd->tunable, pnn));
1423 cd->fatal = true;
1424 }
1425}
1426
1427static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
1428 TALLOC_CTX *tmp_ctx,
1429 struct ctdb_node_map_old *nodemap,
1430 const char *tunable,
1431 uint32_t default_value)
1432{
1433 TDB_DATA data;
1434 struct ctdb_control_get_tunable *t;
1435 uint32_t *nodes;
1436 uint32_t *tvals;
1437 struct get_tunable_callback_data callback_data;
1438 int i;
1439
1440 tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
1441 CTDB_NO_MEMORY_NULL(ctdb, tvals);
1442 for (i=0; i<nodemap->num; i++) {
1443 tvals[i] = default_value;
1444 }
1445
1446 callback_data.out = tvals;
1447 callback_data.tunable = tunable;
1448 callback_data.fatal = false;
1449
1450 data.dsize = offsetof(struct ctdb_control_get_tunable, name) + strlen(tunable) + 1;
1451 data.dptr = talloc_size(tmp_ctx, data.dsize);
1452 t = (struct ctdb_control_get_tunable *)data.dptr;
1453 t->length = strlen(tunable)+1;
1454 memcpy(t->name, tunable, t->length);
1455 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1456 if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
1457 nodes, 0, TAKEOVER_TIMEOUT(),
1458 false, data,
1459 get_tunable_callback,
1460 get_tunable_fail_callback,
1461 &callback_data) != 0) {
1462 if (callback_data.fatal) {
1463 talloc_free(tvals);
1464 tvals = NULL;
1465 }
1466 }
1467 talloc_free(nodes);
1468 talloc_free(data.dptr);
1469
1470 return tvals;
1471}
1472
1473/* Set internal flags for IP allocation:
1474 * Clear ip flags
1475 * Set NOIPTAKOVER ip flags from per-node NoIPTakeover tunable
1476 * Set NOIPHOST ip flag for each INACTIVE node
1477 * if all nodes are disabled:
1478 * Set NOIPHOST ip flags from per-node NoIPHostOnAllDisabled tunable
1479 * else
1480 * Set NOIPHOST ip flags for disabled nodes
1481 */
1482static void set_ipflags_internal(struct ipalloc_state *ipalloc_state,
1483 struct ctdb_node_map_old *nodemap,
1484 uint32_t *tval_noiptakeover,
1485 uint32_t *tval_noiphostonalldisabled)
1486{
1487 int i;
1488
1489 for (i=0;i<nodemap->num;i++) {
1490 /* Can not take IPs on node with NoIPTakeover set */
1491 if (tval_noiptakeover[i] != 0) {
1492 ipalloc_state->noiptakeover[i] = true;
1493 }
1494
1495 /* Can not host IPs on INACTIVE node */
1496 if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
1497 ipalloc_state->noiphost[i] = true;
1498 }
1499 }
1500
1501 if (all_nodes_are_disabled(nodemap)) {
1502 /* If all nodes are disabled, can not host IPs on node
1503 * with NoIPHostOnAllDisabled set
1504 */
1505 for (i=0;i<nodemap->num;i++) {
1506 if (tval_noiphostonalldisabled[i] != 0) {
1507 ipalloc_state->noiphost[i] = true;
1508 }
1509 }
1510 } else {
1511 /* If some nodes are not disabled, then can not host
1512 * IPs on DISABLED node
1513 */
1514 for (i=0;i<nodemap->num;i++) {
1515 if (nodemap->nodes[i].flags & NODE_FLAGS_DISABLED) {
1516 ipalloc_state->noiphost[i] = true;
1517 }
1518 }
1519 }
1520}
1521
1522static bool set_ipflags(struct ctdb_context *ctdb,
1523 struct ipalloc_state *ipalloc_state,
1524 struct ctdb_node_map_old *nodemap)
1525{
1526 uint32_t *tval_noiptakeover;
1527 uint32_t *tval_noiphostonalldisabled;
1528
1529 tval_noiptakeover = get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1530 "NoIPTakeover", 0);
1531 if (tval_noiptakeover == NULL) {
1532 return false;
1533 }
1534
1535 tval_noiphostonalldisabled =
1536 get_tunable_from_nodes(ctdb, ipalloc_state, nodemap,
1537 "NoIPHostOnAllDisabled", 0);
1538 if (tval_noiphostonalldisabled == NULL) {
1539 /* Caller frees tmp_ctx */
1540 return false;
1541 }
1542
1543 set_ipflags_internal(ipalloc_state, nodemap,
1544 tval_noiptakeover,
1545 tval_noiphostonalldisabled);
1546
1547 talloc_free(tval_noiptakeover);
1548 talloc_free(tval_noiphostonalldisabled);
1549
1550 return true;
1551}
1552
1553static struct ipalloc_state * ipalloc_state_init(struct ctdb_context *ctdb,
1554 TALLOC_CTX *mem_ctx)
1555{
1556 struct ipalloc_state *ipalloc_state =
1557 talloc_zero(mem_ctx, struct ipalloc_state);
1558 if (ipalloc_state == NULL) {
1559 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1560 return NULL;
1561 }
1562
1563 ipalloc_state->num = ctdb->num_nodes;
1564 ipalloc_state->known_public_ips =
1565 talloc_zero_array(ipalloc_state,
1566 struct ctdb_public_ip_list_old *,
1567 ipalloc_state->num);
1568 if (ipalloc_state->known_public_ips == NULL) {
1569 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1570 talloc_free(ipalloc_state);
1571 return NULL;
1572 }
1573 ipalloc_state->available_public_ips =
1574 talloc_zero_array(ipalloc_state,
1575 struct ctdb_public_ip_list_old *,
1576 ipalloc_state->num);
1577 if (ipalloc_state->available_public_ips == NULL) {
1578 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1579 talloc_free(ipalloc_state);
1580 return NULL;
1581 }
1582 ipalloc_state->noiptakeover =
1583 talloc_zero_array(ipalloc_state,
1584 bool,
1585 ipalloc_state->num);
1586 if (ipalloc_state->noiptakeover == NULL) {
1587 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1588 talloc_free(ipalloc_state);
1589 return NULL;
1590 }
1591 ipalloc_state->noiphost =
1592 talloc_zero_array(ipalloc_state,
1593 bool,
1594 ipalloc_state->num);
1595 if (ipalloc_state->noiphost == NULL) {
1596 DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
1597 talloc_free(ipalloc_state);
1598 return NULL;
1599 }
1600
1601 if (1 == ctdb->tunable.lcp2_public_ip_assignment) {
1602 ipalloc_state->algorithm = IPALLOC_LCP2;
1603 } else if (1 == ctdb->tunable.deterministic_public_ips) {
1604 ipalloc_state->algorithm = IPALLOC_DETERMINISTIC;
1605 } else {
1606 ipalloc_state->algorithm = IPALLOC_NONDETERMINISTIC;
1607 }
1608
1609 ipalloc_state->no_ip_failback = ctdb->tunable.no_ip_failback;
1610
1611 return ipalloc_state;
1612}
1613
/* State passed to iprealloc_fail_callback() */
struct iprealloc_callback_data {
	/* talloc array indexed by pnn; true for nodes that should be
	 * retried */
	bool *retry_nodes;
	/* Number of nodes flagged for retry */
	int retry_count;
	/* Caller supplied failure callback (may be NULL) and its
	 * argument; invoked when the control timed out */
	client_async_callback fail_callback;
	void *fail_callback_data;
	/* Node map used to look up the flags of the failing node */
	struct ctdb_node_map_old *nodemap;
};
1621
1622static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
1623 int32_t res, TDB_DATA outdata,
1624 void *callback)
1625{
1626 int numnodes;
1627 struct iprealloc_callback_data *cd =
1628 (struct iprealloc_callback_data *)callback;
1629
1630 numnodes = talloc_array_length(cd->retry_nodes);
1631 if (pnn > numnodes) {
1632 DEBUG(DEBUG_ERR,
1633 ("ipreallocated failure from node %d, "
1634 "but only %d nodes in nodemap\n",
1635 pnn, numnodes));
1636 return;
1637 }
1638
1639 /* Can't run the "ipreallocated" event on a INACTIVE node */
1640 if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
1641 DEBUG(DEBUG_WARNING,
1642 ("ipreallocated failed on inactive node %d, ignoring\n",
1643 pnn));
1644 return;
1645 }
1646
1647 switch (res) {
1648 case -ETIME:
1649 /* If the control timed out then that's a real error,
1650 * so call the real fail callback
1651 */
1652 if (cd->fail_callback) {
1653 cd->fail_callback(ctdb, pnn, res, outdata,
1654 cd->fail_callback_data);
1655 } else {
1656 DEBUG(DEBUG_WARNING,
1657 ("iprealloc timed out but no callback registered\n"));
1658 }
1659 break;
1660 default:
1661 /* If not a timeout then either the ipreallocated
1662 * eventscript (or some setup) failed. This might
1663 * have failed because the IPREALLOCATED control isn't
1664 * implemented - right now there is no way of knowing
1665 * because the error codes are all folded down to -1.
1666 * Consider retrying using EVENTSCRIPT control...
1667 */
1668 DEBUG(DEBUG_WARNING,
1669 ("ipreallocated failure from node %d, flagging retry\n",
1670 pnn));
1671 cd->retry_nodes[pnn] = true;
1672 cd->retry_count++;
1673 }
1674}
1675
/* State passed to takeover_run_fail_callback() */
struct takeover_callback_data {
	/* Array with one entry per nodemap index; true once a failure has
	 * been reported for that node */
	bool *node_failed;
	/* Caller supplied failure callback and its argument */
	client_async_callback fail_callback;
	void *fail_callback_data;
	/* Node map used to translate a pnn into an array index */
	struct ctdb_node_map_old *nodemap;
};
1682
1683static void takeover_run_fail_callback(struct ctdb_context *ctdb,
1684 uint32_t node_pnn, int32_t res,
1685 TDB_DATA outdata, void *callback_data)
1686{
1687 struct takeover_callback_data *cd =
1688 talloc_get_type_abort(callback_data,
1689 struct takeover_callback_data);
1690 int i;
1691
1692 for (i = 0; i < cd->nodemap->num; i++) {
1693 if (node_pnn == cd->nodemap->nodes[i].pnn) {
1694 break;
1695 }
1696 }
1697
1698 if (i == cd->nodemap->num) {
1699 DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
1700 return;
1701 }
1702
1703 if (!cd->node_failed[i]) {
1704 cd->node_failed[i] = true;
1705 cd->fail_callback(ctdb, node_pnn, res, outdata,
1706 cd->fail_callback_data);
1707 }
1708}
1709
1710/*
1711 * Recalculate the allocation of public IPs to nodes and have the
1712 * nodes host their allocated addresses.
1713 *
1714 * - Allocate memory for IP allocation state, including per node
1715 * arrays
1716 * - Populate IP allocation algorithm in IP allocation state
1717 * - Populate local value of tunable NoIPFailback in IP allocation
1718 state - this is really a cluster-wide configuration variable and
1719 only the value form the master node is used
1720 * - Retrieve tunables NoIPTakeover and NoIPHostOnAllDisabled from all
1721 * connected nodes - this is done separately so tunable values can
1722 * be faked in unit testing
1723 * - Populate NoIPTakover tunable in IP allocation state
1724 * - Populate NoIPHost in IP allocation state, derived from node flags
1725 * and NoIPHostOnAllDisabled tunable
1726 * - Retrieve and populate known and available IP lists in IP
1727 * allocation state
1728 * - If no available IP addresses then early exit
1729 * - Build list of (known IPs, currently assigned node)
1730 * - Populate list of nodes to force rebalance - internal structure,
1731 * currently no way to fetch, only used by LCP2 for nodes that have
1732 * had new IP addresses added
1733 * - Run IP allocation algorithm
1734 * - Send RELEASE_IP to all nodes for IPs they should not host
1735 * - Send TAKE_IP to all nodes for IPs they should host
1736 * - Send IPREALLOCATED to all nodes (with backward compatibility hack)
1737 */
1738int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map_old *nodemap,
1739 uint32_t *force_rebalance_nodes,
1740 client_async_callback fail_callback, void *callback_data)
1741{
1742 int i, j, ret;
1743 struct ctdb_public_ip ip;
1744 uint32_t *nodes;
1745 struct public_ip_list *all_ips, *tmp_ip;
1746 TDB_DATA data;
1747 struct timeval timeout;
1748 struct client_async_data *async_data;
1749 struct ctdb_client_control_state *state;
1750 TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
1751 struct ipalloc_state *ipalloc_state;
1752 struct takeover_callback_data *takeover_data;
1753 struct iprealloc_callback_data iprealloc_data;
1754 bool *retry_data;
1755 bool can_host_ips;
1756
1757 /* Default timeout for early jump to IPREALLOCATED. See below
1758 * for explanation of 3 times... */
1759 timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
1760
1761 /*
1762 * ip failover is completely disabled, just send out the
1763 * ipreallocated event.
1764 */
1765 if (ctdb->tunable.disable_ip_failover != 0) {
1766 goto ipreallocated;
1767 }
1768
1769 ipalloc_state = ipalloc_state_init(ctdb, tmp_ctx);
1770 if (ipalloc_state == NULL) {
1771 talloc_free(tmp_ctx);
1772 return -1;
1773 }
1774
1775 if (!set_ipflags(ctdb, ipalloc_state, nodemap)) {
1776 DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
1777 talloc_free(tmp_ctx);
1778 return -1;
1779 }
1780
1781 /* Fetch known/available public IPs from each active node */
1782 ret = ctdb_reload_remote_public_ips(ctdb, ipalloc_state, nodemap);
1783 if (ret != 0) {
1784 talloc_free(tmp_ctx);
1785 return -1;
1786 }
1787
1788 /* Short-circuit IP allocation if no node has available IPs */
1789 can_host_ips = false;
1790 for (i=0; i < ipalloc_state->num; i++) {
1791 if (ipalloc_state->available_public_ips[i] != NULL) {
1792 can_host_ips = true;
1793 }
1794 }
1795 if (!can_host_ips) {
1796 DEBUG(DEBUG_WARNING,("No nodes available to host public IPs yet\n"));
1797 return 0;
1798 }
1799
1800 /* since nodes only know about those public addresses that
1801 can be served by that particular node, no single node has
1802 a full list of all public addresses that exist in the cluster.
1803 Walk over all node structures and create a merged list of
1804 all public addresses that exist in the cluster.
1805
1806 keep the tree of ips around as ctdb->ip_tree
1807 */
1808 all_ips = create_merged_ip_list(ctdb, ipalloc_state);
1809 ipalloc_state->all_ips = all_ips;
1810
1811 ipalloc_state->force_rebalance_nodes = force_rebalance_nodes;
1812
1813 /* Do the IP reassignment calculations */
1814 ipalloc(ipalloc_state);
1815
1816 /* Now tell all nodes to release any public IPs should not
1817 * host. This will be a NOOP on nodes that don't currently
1818 * hold the given IP.
1819 */
1820 takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
1821 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
1822
1823 takeover_data->node_failed = talloc_zero_array(tmp_ctx,
1824 bool, nodemap->num);
1825 CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
1826 takeover_data->fail_callback = fail_callback;
1827 takeover_data->fail_callback_data = callback_data;
1828 takeover_data->nodemap = nodemap;
1829
1830 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1831 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1832
1833 async_data->fail_callback = takeover_run_fail_callback;
1834 async_data->callback_data = takeover_data;
1835
1836 ZERO_STRUCT(ip); /* Avoid valgrind warnings for union */
1837
1838 /* Each of the following stages (RELEASE_IP, TAKEOVER_IP,
1839 * IPREALLOCATED) notionally has a timeout of TakeoverTimeout
1840 * seconds. However, RELEASE_IP can take longer due to TCP
1841 * connection killing, so sometimes needs more time.
1842 * Therefore, use a cumulative timeout of TakeoverTimeout * 3
1843 * seconds across all 3 stages. No explicit expiry checks are
1844 * needed before each stage because tevent is smart enough to
1845 * fire the timeouts even if they are in the past. Initialise
1846 * this here so it explicitly covers the stages we're
1847 * interested in but, in particular, not the time taken by the
1848 * ipalloc().
1849 */
1850 timeout = timeval_current_ofs(3 * ctdb->tunable.takeover_timeout, 0);
1851
1852 /* Send a RELEASE_IP to all nodes that should not be hosting
1853 * each IP. For each IP, all but one of these will be
1854 * redundant. However, the redundant ones are used to tell
1855 * nodes which node should be hosting the IP so that commands
1856 * like "ctdb ip" can display a particular nodes idea of who
1857 * is hosting what. */
1858 for (i=0;i<nodemap->num;i++) {
1859 /* don't talk to unconnected nodes, but do talk to banned nodes */
1860 if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
1861 continue;
1862 }
1863
1864 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1865 if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
1866 /* This node should be serving this
1867 vnn so don't tell it to release the ip
1868 */
1869 continue;
1870 }
1871 ip.pnn = tmp_ip->pnn;
1872 ip.addr = tmp_ip->addr;
1873
1874 data.dsize = sizeof(ip);
1875 data.dptr = (uint8_t *)&ip;
1876 state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
1877 0, CTDB_CONTROL_RELEASE_IP, 0,
1878 data, async_data,
1879 &timeout, NULL);
1880 if (state == NULL) {
1881 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
1882 talloc_free(tmp_ctx);
1883 return -1;
1884 }
1885
1886 ctdb_client_async_add(async_data, state);
1887 }
1888 }
1889 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1890 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
1891 talloc_free(tmp_ctx);
1892 return -1;
1893 }
1894 talloc_free(async_data);
1895
1896
1897 /* For each IP, send a TAKOVER_IP to the node that should be
1898 * hosting it. Many of these will often be redundant (since
1899 * the allocation won't have changed) but they can be useful
1900 * to recover from inconsistencies. */
1901 async_data = talloc_zero(tmp_ctx, struct client_async_data);
1902 CTDB_NO_MEMORY_FATAL(ctdb, async_data);
1903
1904 async_data->fail_callback = fail_callback;
1905 async_data->callback_data = callback_data;
1906
1907 for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
1908 if (tmp_ip->pnn == -1) {
1909 /* this IP won't be taken over */
1910 continue;
1911 }
1912
1913 ip.pnn = tmp_ip->pnn;
1914 ip.addr = tmp_ip->addr;
1915
1916 data.dsize = sizeof(ip);
1917 data.dptr = (uint8_t *)&ip;
1918 state = ctdb_control_send(ctdb, tmp_ip->pnn,
1919 0, CTDB_CONTROL_TAKEOVER_IP, 0,
1920 data, async_data, &timeout, NULL);
1921 if (state == NULL) {
1922 DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
1923 talloc_free(tmp_ctx);
1924 return -1;
1925 }
1926
1927 ctdb_client_async_add(async_data, state);
1928 }
1929 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
1930 DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
1931 talloc_free(tmp_ctx);
1932 return -1;
1933 }
1934
1935ipreallocated:
1936 /*
1937 * Tell all nodes to run eventscripts to process the
1938 * "ipreallocated" event. This can do a lot of things,
1939 * including restarting services to reconfigure them if public
1940 * IPs have moved. Once upon a time this event only used to
1941 * update natgw.
1942 */
1943 retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
1944 CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
1945 iprealloc_data.retry_nodes = retry_data;
1946 iprealloc_data.retry_count = 0;
1947 iprealloc_data.fail_callback = fail_callback;
1948 iprealloc_data.fail_callback_data = callback_data;
1949 iprealloc_data.nodemap = nodemap;
1950
1951 nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
1952 ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
1953 nodes, 0, timeout,
1954 false, tdb_null,
1955 NULL, iprealloc_fail_callback,
1956 &iprealloc_data);
1957 if (ret != 0) {
1958 /* If the control failed then we should retry to any
1959 * nodes flagged by iprealloc_fail_callback using the
1960 * EVENTSCRIPT control. This is a best-effort at
1961 * backward compatiblity when running a mixed cluster
1962 * where some nodes have not yet been upgraded to
1963 * support the IPREALLOCATED control.
1964 */
1965 DEBUG(DEBUG_WARNING,
1966 ("Retry ipreallocated to some nodes using eventscript control\n"));
1967
1968 nodes = talloc_array(tmp_ctx, uint32_t,
1969 iprealloc_data.retry_count);
1970 CTDB_NO_MEMORY_FATAL(ctdb, nodes);
1971
1972 j = 0;
1973 for (i=0; i<nodemap->num; i++) {
1974 if (iprealloc_data.retry_nodes[i]) {
1975 nodes[j] = i;
1976 j++;
1977 }
1978 }
1979
1980 data.dptr = discard_const("ipreallocated");
1981 data.dsize = strlen((char *)data.dptr) + 1;
1982 ret = ctdb_client_async_control(ctdb,
1983 CTDB_CONTROL_RUN_EVENTSCRIPTS,
1984 nodes, 0, TAKEOVER_TIMEOUT(),
1985 false, data,
1986 NULL, fail_callback,
1987 callback_data);
1988 if (ret != 0) {
1989 DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
1990 }
1991 }
1992
1993 talloc_free(tmp_ctx);
1994 return ret;
1995}
1996
1997
1998/*
1999 destroy a ctdb_client_ip structure
2000 */
2001static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
2002{
2003 DEBUG(DEBUG_DEBUG,("destroying client tcp for %s:%u (client_id %u)\n",
2004 ctdb_addr_to_str(&ip->addr),
2005 ntohs(ip->addr.ip.sin_port),
2006 ip->client_id));
2007
2008 DLIST_REMOVE(ip->ctdb->client_ip_list, ip);
2009 return 0;
2010}
2011
2012/*
2013 called by a client to inform us of a TCP connection that it is managing
2014 that should tickled with an ACK when IP takeover is done
2015 */
2016int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
2017 TDB_DATA indata)
2018{
2019 struct ctdb_client *client = reqid_find(ctdb->idr, client_id, struct ctdb_client);
2020 struct ctdb_connection *tcp_sock = NULL;
2021 struct ctdb_tcp_list *tcp;
2022 struct ctdb_connection t;
2023 int ret;
2024 TDB_DATA data;
2025 struct ctdb_client_ip *ip;
2026 struct ctdb_vnn *vnn;
2027 ctdb_sock_addr addr;
2028
2029 /* If we don't have public IPs, tickles are useless */
2030 if (ctdb->vnn == NULL) {
2031 return 0;
2032 }
2033
2034 tcp_sock = (struct ctdb_connection *)indata.dptr;
2035
2036 addr = tcp_sock->src;
2037 ctdb_canonicalize_ip(&addr, &tcp_sock->src);
2038 addr = tcp_sock->dst;
2039 ctdb_canonicalize_ip(&addr, &tcp_sock->dst);
2040
2041 ZERO_STRUCT(addr);
2042 memcpy(&addr, &tcp_sock->dst, sizeof(addr));
2043 vnn = find_public_ip_vnn(ctdb, &addr);
2044 if (vnn == NULL) {
2045 switch (addr.sa.sa_family) {
2046 case AF_INET:
2047 if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
2048 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n",
2049 ctdb_addr_to_str(&addr)));
2050 }
2051 break;
2052 case AF_INET6:
2053 DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n",
2054 ctdb_addr_to_str(&addr)));
2055 break;
2056 default:
2057 DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
2058 }
2059
2060 return 0;
2061 }
2062
2063 if (vnn->pnn != ctdb->pnn) {
2064 DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
2065 ctdb_addr_to_str(&addr),
2066 client_id, client->pid));
2067 /* failing this call will tell smbd to die */
2068 return -1;
2069 }
2070
2071 ip = talloc(client, struct ctdb_client_ip);
2072 CTDB_NO_MEMORY(ctdb, ip);
2073
2074 ip->ctdb = ctdb;
2075 ip->addr = addr;
2076 ip->client_id = client_id;
2077 talloc_set_destructor(ip, ctdb_client_ip_destructor);
2078 DLIST_ADD(ctdb->client_ip_list, ip);
2079
2080 tcp = talloc(client, struct ctdb_tcp_list);
2081 CTDB_NO_MEMORY(ctdb, tcp);
2082
2083 tcp->connection.src = tcp_sock->src;
2084 tcp->connection.dst = tcp_sock->dst;
2085
2086 DLIST_ADD(client->tcp_list, tcp);
2087
2088 t.src = tcp_sock->src;
2089 t.dst = tcp_sock->dst;
2090
2091 data.dptr = (uint8_t *)&t;
2092 data.dsize = sizeof(t);
2093
2094 switch (addr.sa.sa_family) {
2095 case AF_INET:
2096 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2097 (unsigned)ntohs(tcp_sock->dst.ip.sin_port),
2098 ctdb_addr_to_str(&tcp_sock->src),
2099 (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
2100 break;
2101 case AF_INET6:
2102 DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
2103 (unsigned)ntohs(tcp_sock->dst.ip6.sin6_port),
2104 ctdb_addr_to_str(&tcp_sock->src),
2105 (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
2106 break;
2107 default:
2108 DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
2109 }
2110
2111
2112 /* tell all nodes about this tcp connection */
2113 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
2114 CTDB_CONTROL_TCP_ADD,
2115 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
2116 if (ret != 0) {
2117 DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
2118 return -1;
2119 }
2120
2121 return 0;
2122}
2123
2124/*
2125 find a tcp address on a list
2126 */
2127static struct ctdb_connection *ctdb_tcp_find(struct ctdb_tcp_array *array,
2128 struct ctdb_connection *tcp)
2129{
2130 int i;
2131
2132 if (array == NULL) {
2133 return NULL;
2134 }
2135
2136 for (i=0;i<array->num;i++) {
2137 if (ctdb_same_sockaddr(&array->connections[i].src, &tcp->src) &&
2138 ctdb_same_sockaddr(&array->connections[i].dst, &tcp->dst)) {
2139 return &array->connections[i];
2140 }
2141 }
2142 return NULL;
2143}
2144
2145
2146
2147/*
2148 called by a daemon to inform us of a TCP connection that one of its
2149 clients managing that should tickled with an ACK when IP takeover is
2150 done
2151 */
2152int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
2153{
2154 struct ctdb_connection *p = (struct ctdb_connection *)indata.dptr;
2155 struct ctdb_tcp_array *tcparray;
2156 struct ctdb_connection tcp;
2157 struct ctdb_vnn *vnn;
2158
2159 /* If we don't have public IPs, tickles are useless */
2160 if (ctdb->vnn == NULL) {
2161 return 0;
2162 }
2163
2164 vnn = find_public_ip_vnn(ctdb, &p->dst);
2165 if (vnn == NULL) {
2166 DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
2167 ctdb_addr_to_str(&p->dst)));
2168
2169 return -1;
2170 }
2171
2172
2173 tcparray = vnn->tcp_array;
2174
2175 /* If this is the first tickle */
2176 if (tcparray == NULL) {
2177 tcparray = talloc(vnn, struct ctdb_tcp_array);
2178 CTDB_NO_MEMORY(ctdb, tcparray);
2179 vnn->tcp_array = tcparray;
2180
2181 tcparray->num = 0;
2182 tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_connection));
2183 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2184
2185 tcparray->connections[tcparray->num].src = p->src;
2186 tcparray->connections[tcparray->num].dst = p->dst;
2187 tcparray->num++;
2188
2189 if (tcp_update_needed) {
2190 vnn->tcp_update_needed = true;
2191 }
2192 return 0;
2193 }
2194
2195
2196 /* Do we already have this tickle ?*/
2197 tcp.src = p->src;
2198 tcp.dst = p->dst;
2199 if (ctdb_tcp_find(tcparray, &tcp) != NULL) {
2200 DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
2201 ctdb_addr_to_str(&tcp.dst),
2202 ntohs(tcp.dst.ip.sin_port),
2203 vnn->pnn));
2204 return 0;
2205 }
2206
2207 /* A new tickle, we must add it to the array */
2208 tcparray->connections = talloc_realloc(tcparray, tcparray->connections,
2209 struct ctdb_connection,
2210 tcparray->num+1);
2211 CTDB_NO_MEMORY(ctdb, tcparray->connections);
2212
2213 tcparray->connections[tcparray->num].src = p->src;
2214 tcparray->connections[tcparray->num].dst = p->dst;
2215 tcparray->num++;
2216
2217 DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
2218 ctdb_addr_to_str(&tcp.dst),
2219 ntohs(tcp.dst.ip.sin_port),
2220 vnn->pnn));
2221
2222 if (tcp_update_needed) {
2223 vnn->tcp_update_needed = true;
2224 }
2225
2226 return 0;
2227}
2228
2229
2230static void ctdb_remove_connection(struct ctdb_vnn *vnn, struct ctdb_connection *conn)
2231{
2232 struct ctdb_connection *tcpp;
2233
2234 if (vnn == NULL) {
2235 return;
2236 }
2237
2238 /* if the array is empty we cant remove it
2239 and we don't need to do anything
2240 */
2241 if (vnn->tcp_array == NULL) {
2242 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
2243 ctdb_addr_to_str(&conn->dst),
2244 ntohs(conn->dst.ip.sin_port)));
2245 return;
2246 }
2247
2248
2249 /* See if we know this connection
2250 if we don't know this connection then we dont need to do anything
2251 */
2252 tcpp = ctdb_tcp_find(vnn->tcp_array, conn);
2253 if (tcpp == NULL) {
2254 DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
2255 ctdb_addr_to_str(&conn->dst),
2256 ntohs(conn->dst.ip.sin_port)));
2257 return;
2258 }
2259
2260
2261 /* We need to remove this entry from the array.
2262 Instead of allocating a new array and copying data to it
2263 we cheat and just copy the last entry in the existing array
2264 to the entry that is to be removed and just shring the
2265 ->num field
2266 */
2267 *tcpp = vnn->tcp_array->connections[vnn->tcp_array->num - 1];
2268 vnn->tcp_array->num--;
2269
2270 /* If we deleted the last entry we also need to remove the entire array
2271 */
2272 if (vnn->tcp_array->num == 0) {
2273 talloc_free(vnn->tcp_array);
2274 vnn->tcp_array = NULL;
2275 }
2276
2277 vnn->tcp_update_needed = true;
2278
2279 DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
2280 ctdb_addr_to_str(&conn->src),
2281 ntohs(conn->src.ip.sin_port)));
2282}
2283
2284
2285/*
2286 called by a daemon to inform us of a TCP connection that one of its
2287 clients used are no longer needed in the tickle database
2288 */
2289int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
2290{
2291 struct ctdb_vnn *vnn;
2292 struct ctdb_connection *conn = (struct ctdb_connection *)indata.dptr;
2293
2294 /* If we don't have public IPs, tickles are useless */
2295 if (ctdb->vnn == NULL) {
2296 return 0;
2297 }
2298
2299 vnn = find_public_ip_vnn(ctdb, &conn->dst);
2300 if (vnn == NULL) {
2301 DEBUG(DEBUG_ERR,
2302 (__location__ " unable to find public address %s\n",
2303 ctdb_addr_to_str(&conn->dst)));
2304 return 0;
2305 }
2306
2307 ctdb_remove_connection(vnn, conn);
2308
2309 return 0;
2310}
2311
2312
2313/*
2314 Called when another daemon starts - causes all tickles for all
2315 public addresses we are serving to be sent to the new node on the
2316 next check. This actually causes the next scheduled call to
2317 tdb_update_tcp_tickles() to update all nodes. This is simple and
2318 doesn't require careful error handling.
2319 */
2320int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t pnn)
2321{
2322 struct ctdb_vnn *vnn;
2323
2324 DEBUG(DEBUG_INFO, ("Received startup control from node %lu\n",
2325 (unsigned long) pnn));
2326
2327 for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
2328 vnn->tcp_update_needed = true;
2329 }
2330
2331 return 0;
2332}
2333
2334
2335/*
2336 called when a client structure goes away - hook to remove
2337 elements from the tcp_list in all daemons
2338 */
2339void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
2340{
2341 while (client->tcp_list) {
2342 struct ctdb_vnn *vnn;
2343 struct ctdb_tcp_list *tcp = client->tcp_list;
2344 struct ctdb_connection *conn = &tcp->connection;
2345
2346 DLIST_REMOVE(client->tcp_list, tcp);
2347
2348 vnn = find_public_ip_vnn(client->ctdb,
2349 &conn->dst);
2350 if (vnn == NULL) {
2351 DEBUG(DEBUG_ERR,
2352 (__location__ " unable to find public address %s\n",
2353 ctdb_addr_to_str(&conn->dst)));
2354 continue;
2355 }
2356
2357 /* If the IP address is hosted on this node then
2358 * remove the connection. */
2359 if (vnn->pnn == client->ctdb->pnn) {
2360 ctdb_remove_connection(vnn, conn);
2361 }
2362
2363 /* Otherwise this function has been called because the
2364 * server IP address has been released to another node
2365 * and the client has exited. This means that we
2366 * should not delete the connection information. The
2367 * takeover node processes connections too. */
2368 }
2369}
2370
2371
2372void ctdb_release_all_ips(struct ctdb_context *ctdb)
2373{
2374 struct ctdb_vnn *vnn, *next;
2375 int count = 0;
2376
2377 if (ctdb->tunable.disable_ip_failover == 1) {
2378 return;
2379 }
2380
2381 for (vnn = ctdb->vnn; vnn != NULL; vnn = next) {
2382 /* vnn can be freed below in release_ip_post() */
2383 next = vnn->next;
2384
2385 if (!ctdb_sys_have_ip(&vnn->public_address)) {
2386 ctdb_vnn_unassign_iface(ctdb, vnn);
2387 continue;
2388 }
2389
2390 /* Don't allow multiple releases at once. Some code,
2391 * particularly ctdb_tickle_sentenced_connections() is
2392 * not re-entrant */
2393 if (vnn->update_in_flight) {
2394 DEBUG(DEBUG_WARNING,
2395 (__location__
2396 " Not releasing IP %s/%u on interface %s, an update is already in progess\n",
2397 ctdb_addr_to_str(&vnn->public_address),
2398 vnn->public_netmask_bits,
2399 ctdb_vnn_iface_string(vnn)));
2400 continue;
2401 }
2402 vnn->update_in_flight = true;
2403
2404 DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
2405 ctdb_addr_to_str(&vnn->public_address),
2406 vnn->public_netmask_bits,
2407 ctdb_vnn_iface_string(vnn)));
2408
2409 ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
2410 ctdb_vnn_iface_string(vnn),
2411 ctdb_addr_to_str(&vnn->public_address),
2412 vnn->public_netmask_bits);
2413 /* releaseip timeouts are converted to success, so to
2414 * detect failures just check if the IP address is
2415 * still there...
2416 */
2417 if (ctdb_sys_have_ip(&vnn->public_address)) {
2418 DEBUG(DEBUG_ERR,
2419 (__location__
2420 " IP address %s not released\n",
2421 ctdb_addr_to_str(&vnn->public_address)));
2422 vnn->update_in_flight = false;
2423 continue;
2424 }
2425
2426 vnn = release_ip_post(ctdb, vnn, &vnn->public_address);
2427 if (vnn != NULL) {
2428 vnn->update_in_flight = false;
2429 }
2430 count++;
2431 }
2432
2433 DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", count));
2434}
2435
2436
2437/*
2438 get list of public IPs
2439 */
2440int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb,
2441 struct ctdb_req_control_old *c, TDB_DATA *outdata)
2442{
2443 int i, num, len;
2444 struct ctdb_public_ip_list_old *ips;
2445 struct ctdb_vnn *vnn;
2446 bool only_available = false;
2447
2448 if (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) {
2449 only_available = true;
2450 }
2451
2452 /* count how many public ip structures we have */
2453 num = 0;
2454 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2455 num++;
2456 }
2457
2458 len = offsetof(struct ctdb_public_ip_list_old, ips) +
2459 num*sizeof(struct ctdb_public_ip);
2460 ips = talloc_zero_size(outdata, len);
2461 CTDB_NO_MEMORY(ctdb, ips);
2462
2463 i = 0;
2464 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
2465 if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
2466 continue;
2467 }
2468 ips->ips[i].pnn = vnn->pnn;
2469 ips->ips[i].addr = vnn->public_address;
2470 i++;
2471 }
2472 ips->num = i;
2473 len = offsetof(struct ctdb_public_ip_list_old, ips) +
2474 i*sizeof(struct ctdb_public_ip);
2475
2476 outdata->dsize = len;
2477 outdata->dptr = (uint8_t *)ips;
2478
2479 return 0;
2480}
2481
2482
2483int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
2484 struct ctdb_req_control_old *c,
2485 TDB_DATA indata,
2486 TDB_DATA *outdata)
2487{
2488 int i, num, len;
2489 ctdb_sock_addr *addr;
2490 struct ctdb_public_ip_info_old *info;
2491 struct ctdb_vnn *vnn;
2492
2493 addr = (ctdb_sock_addr *)indata.dptr;
2494
2495 vnn = find_public_ip_vnn(ctdb, addr);
2496 if (vnn == NULL) {
2497 /* if it is not a public ip it could be our 'single ip' */
2498 if (ctdb->single_ip_vnn) {
2499 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
2500 vnn = ctdb->single_ip_vnn;
2501 }
2502 }
2503 }
2504 if (vnn == NULL) {
2505 DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
2506 "'%s'not a public address\n",
2507 ctdb_addr_to_str(addr)));
2508 return -1;
2509 }
2510
2511 /* count how many public ip structures we have */
2512 num = 0;
2513 for (;vnn->ifaces[num];) {
2514 num++;
2515 }
2516
2517 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2518 num*sizeof(struct ctdb_iface);
2519 info = talloc_zero_size(outdata, len);
2520 CTDB_NO_MEMORY(ctdb, info);
2521
2522 info->ip.addr = vnn->public_address;
2523 info->ip.pnn = vnn->pnn;
2524 info->active_idx = 0xFFFFFFFF;
2525
2526 for (i=0; vnn->ifaces[i]; i++) {
2527 struct ctdb_interface *cur;
2528
2529 cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
2530 if (cur == NULL) {
2531 DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
2532 vnn->ifaces[i]));
2533 return -1;
2534 }
2535 if (vnn->iface == cur) {
2536 info->active_idx = i;
2537 }
2538 strncpy(info->ifaces[i].name, cur->name, sizeof(info->ifaces[i].name)-1);
2539 info->ifaces[i].link_state = cur->link_up;
2540 info->ifaces[i].references = cur->references;
2541 }
2542 info->num = i;
2543 len = offsetof(struct ctdb_public_ip_info_old, ifaces) +
2544 i*sizeof(struct ctdb_iface);
2545
2546 outdata->dsize = len;
2547 outdata->dptr = (uint8_t *)info;
2548
2549 return 0;
2550}
2551
2552int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
2553 struct ctdb_req_control_old *c,
2554 TDB_DATA *outdata)
2555{
2556 int i, num, len;
2557 struct ctdb_iface_list_old *ifaces;
2558 struct ctdb_interface *cur;
2559
2560 /* count how many public ip structures we have */
2561 num = 0;
2562 for (cur=ctdb->ifaces;cur;cur=cur->next) {
2563 num++;
2564 }
2565
2566 len = offsetof(struct ctdb_iface_list_old, ifaces) +
2567 num*sizeof(struct ctdb_iface);
2568 ifaces = talloc_zero_size(outdata, len);
2569 CTDB_NO_MEMORY(ctdb, ifaces);
2570
2571 i = 0;
2572 for (cur=ctdb->ifaces;cur;cur=cur->next) {
2573 strcpy(ifaces->ifaces[i].name, cur->name);
2574 ifaces->ifaces[i].link_state = cur->link_up;
2575 ifaces->ifaces[i].references = cur->references;
2576 i++;
2577 }
2578 ifaces->num = i;
2579 len = offsetof(struct ctdb_iface_list_old, ifaces) +
2580 i*sizeof(struct ctdb_iface);
2581
2582 outdata->dsize = len;
2583 outdata->dptr = (uint8_t *)ifaces;
2584
2585 return 0;
2586}
2587
2588int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
2589 struct ctdb_req_control_old *c,
2590 TDB_DATA indata)
2591{
2592 struct ctdb_iface *info;
2593 struct ctdb_interface *iface;
2594 bool link_up = false;
2595
2596 info = (struct ctdb_iface *)indata.dptr;
2597
2598 if (info->name[CTDB_IFACE_SIZE] != '\0') {
2599 int len = strnlen(info->name, CTDB_IFACE_SIZE);
2600 DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
2601 len, len, info->name));
2602 return -1;
2603 }
2604
2605 switch (info->link_state) {
2606 case 0:
2607 link_up = false;
2608 break;
2609 case 1:
2610 link_up = true;
2611 break;
2612 default:
2613 DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
2614 (unsigned int)info->link_state));
2615 return -1;
2616 }
2617
2618 if (info->references != 0) {
2619 DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
2620 (unsigned int)info->references));
2621 return -1;
2622 }
2623
2624 iface = ctdb_find_iface(ctdb, info->name);
2625 if (iface == NULL) {
2626 return -1;
2627 }
2628
2629 if (link_up == iface->link_up) {
2630 return 0;
2631 }
2632
2633 DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
2634 ("iface[%s] has changed it's link status %s => %s\n",
2635 iface->name,
2636 iface->link_up?"up":"down",
2637 link_up?"up":"down"));
2638
2639 iface->link_up = link_up;
2640 return 0;
2641}
2642
2643
2644/*
2645 structure containing the listening socket and the list of tcp connections
2646 that the ctdb daemon is to kill
2647*/
2648struct ctdb_kill_tcp {
2649 struct ctdb_vnn *vnn;
2650 struct ctdb_context *ctdb;
2651 int capture_fd;
2652 struct tevent_fd *fde;
2653 trbt_tree_t *connections;
2654 void *private_data;
2655};
2656
2657/*
2658 a tcp connection that is to be killed
2659 */
2660struct ctdb_killtcp_con {
2661 ctdb_sock_addr src_addr;
2662 ctdb_sock_addr dst_addr;
2663 int count;
2664 struct ctdb_kill_tcp *killtcp;
2665};
2666
2667/* this function is used to create a key to represent this socketpair
2668 in the killtcp tree.
2669 this key is used to insert and lookup matching socketpairs that are
2670 to be tickled and RST
2671*/
2672#define KILLTCP_KEYLEN 10
2673static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
2674{
2675 static uint32_t key[KILLTCP_KEYLEN];
2676
2677 bzero(key, sizeof(key));
2678
2679 if (src->sa.sa_family != dst->sa.sa_family) {
2680 DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
2681 return key;
2682 }
2683
2684 switch (src->sa.sa_family) {
2685 case AF_INET:
2686 key[0] = dst->ip.sin_addr.s_addr;
2687 key[1] = src->ip.sin_addr.s_addr;
2688 key[2] = dst->ip.sin_port;
2689 key[3] = src->ip.sin_port;
2690 break;
2691 case AF_INET6: {
2692 uint32_t *dst6_addr32 =
2693 (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
2694 uint32_t *src6_addr32 =
2695 (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
2696 key[0] = dst6_addr32[3];
2697 key[1] = src6_addr32[3];
2698 key[2] = dst6_addr32[2];
2699 key[3] = src6_addr32[2];
2700 key[4] = dst6_addr32[1];
2701 key[5] = src6_addr32[1];
2702 key[6] = dst6_addr32[0];
2703 key[7] = src6_addr32[0];
2704 key[8] = dst->ip6.sin6_port;
2705 key[9] = src->ip6.sin6_port;
2706 break;
2707 }
2708 default:
2709 DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
2710 return key;
2711 }
2712
2713 return key;
2714}
2715
2716/*
2717 called when we get a read event on the raw socket
2718 */
2719static void capture_tcp_handler(struct tevent_context *ev,
2720 struct tevent_fd *fde,
2721 uint16_t flags, void *private_data)
2722{
2723 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2724 struct ctdb_killtcp_con *con;
2725 ctdb_sock_addr src, dst;
2726 uint32_t ack_seq, seq;
2727
2728 if (!(flags & TEVENT_FD_READ)) {
2729 return;
2730 }
2731
2732 if (ctdb_sys_read_tcp_packet(killtcp->capture_fd,
2733 killtcp->private_data,
2734 &src, &dst,
2735 &ack_seq, &seq) != 0) {
2736 /* probably a non-tcp ACK packet */
2737 return;
2738 }
2739
2740 /* check if we have this guy in our list of connections
2741 to kill
2742 */
2743 con = trbt_lookuparray32(killtcp->connections,
2744 KILLTCP_KEYLEN, killtcp_key(&src, &dst));
2745 if (con == NULL) {
2746 /* no this was some other packet we can just ignore */
2747 return;
2748 }
2749
2750 /* This one has been tickled !
2751 now reset him and remove him from the list.
2752 */
2753 DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
2754 ntohs(con->dst_addr.ip.sin_port),
2755 ctdb_addr_to_str(&con->src_addr),
2756 ntohs(con->src_addr.ip.sin_port)));
2757
2758 ctdb_sys_send_tcp(&con->dst_addr, &con->src_addr, ack_seq, seq, 1);
2759 talloc_free(con);
2760}
2761
2762
2763/* when traversing the list of all tcp connections to send tickle acks to
2764 (so that we can capture the ack coming back and kill the connection
2765 by a RST)
2766 this callback is called for each connection we are currently trying to kill
2767*/
2768static int tickle_connection_traverse(void *param, void *data)
2769{
2770 struct ctdb_killtcp_con *con = talloc_get_type(data, struct ctdb_killtcp_con);
2771
2772 /* have tried too many times, just give up */
2773 if (con->count >= 5) {
2774 /* can't delete in traverse: reparent to delete_cons */
2775 talloc_steal(param, con);
2776 return 0;
2777 }
2778
2779 /* othervise, try tickling it again */
2780 con->count++;
2781 ctdb_sys_send_tcp(
2782 (ctdb_sock_addr *)&con->dst_addr,
2783 (ctdb_sock_addr *)&con->src_addr,
2784 0, 0, 0);
2785 return 0;
2786}
2787
2788
2789/*
2790 called every second until all sentenced connections have been reset
2791 */
2792static void ctdb_tickle_sentenced_connections(struct tevent_context *ev,
2793 struct tevent_timer *te,
2794 struct timeval t, void *private_data)
2795{
2796 struct ctdb_kill_tcp *killtcp = talloc_get_type(private_data, struct ctdb_kill_tcp);
2797 void *delete_cons = talloc_new(NULL);
2798
2799 /* loop over all connections sending tickle ACKs */
2800 trbt_traversearray32(killtcp->connections, KILLTCP_KEYLEN, tickle_connection_traverse, delete_cons);
2801
2802 /* now we've finished traverse, it's safe to do deletion. */
2803 talloc_free(delete_cons);
2804
2805 /* If there are no more connections to kill we can remove the
2806 entire killtcp structure
2807 */
2808 if ( (killtcp->connections == NULL) ||
2809 (killtcp->connections->root == NULL) ) {
2810 talloc_free(killtcp);
2811 return;
2812 }
2813
2814 /* try tickling them again in a seconds time
2815 */
2816 tevent_add_timer(killtcp->ctdb->ev, killtcp,
2817 timeval_current_ofs(1, 0),
2818 ctdb_tickle_sentenced_connections, killtcp);
2819}
2820
2821/*
2822 destroy the killtcp structure
2823 */
2824static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
2825{
2826 struct ctdb_vnn *tmpvnn;
2827
2828 /* verify that this vnn is still active */
2829 for (tmpvnn = killtcp->ctdb->vnn; tmpvnn; tmpvnn = tmpvnn->next) {
2830 if (tmpvnn == killtcp->vnn) {
2831 break;
2832 }
2833 }
2834
2835 if (tmpvnn == NULL) {
2836 return 0;
2837 }
2838
2839 if (killtcp->vnn->killtcp != killtcp) {
2840 return 0;
2841 }
2842
2843 killtcp->vnn->killtcp = NULL;
2844
2845 return 0;
2846}
2847
2848
/* Insert callback for the killtcp tree.  Nothing fancy: whatever is
   already stored under the key is unconditionally replaced by the new
   connection structure.

   The old one is not even freed here if it existed; it is
   talloc_stolen by the same node in the tree anyway and will be
   deleted when the new data is deleted.
*/
static void *add_killtcp_callback(void *parm, void *data)
{
	void *replacement = parm;

	return replacement;
}
2860
2861/*
2862 add a tcp socket to the list of connections we want to RST
2863 */
2864static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb,
2865 ctdb_sock_addr *s,
2866 ctdb_sock_addr *d)
2867{
2868 ctdb_sock_addr src, dst;
2869 struct ctdb_kill_tcp *killtcp;
2870 struct ctdb_killtcp_con *con;
2871 struct ctdb_vnn *vnn;
2872
2873 ctdb_canonicalize_ip(s, &src);
2874 ctdb_canonicalize_ip(d, &dst);
2875
2876 vnn = find_public_ip_vnn(ctdb, &dst);
2877 if (vnn == NULL) {
2878 vnn = find_public_ip_vnn(ctdb, &src);
2879 }
2880 if (vnn == NULL) {
2881 /* if it is not a public ip it could be our 'single ip' */
2882 if (ctdb->single_ip_vnn) {
2883 if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
2884 vnn = ctdb->single_ip_vnn;
2885 }
2886 }
2887 }
2888 if (vnn == NULL) {
2889 DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n"));
2890 return -1;
2891 }
2892
2893 killtcp = vnn->killtcp;
2894
2895 /* If this is the first connection to kill we must allocate
2896 a new structure
2897 */
2898 if (killtcp == NULL) {
2899 killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
2900 CTDB_NO_MEMORY(ctdb, killtcp);
2901
2902 killtcp->vnn = vnn;
2903 killtcp->ctdb = ctdb;
2904 killtcp->capture_fd = -1;
2905 killtcp->connections = trbt_create(killtcp, 0);
2906
2907 vnn->killtcp = killtcp;
2908 talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
2909 }
2910
2911
2912
2913 /* create a structure that describes this connection we want to
2914 RST and store it in killtcp->connections
2915 */
2916 con = talloc(killtcp, struct ctdb_killtcp_con);
2917 CTDB_NO_MEMORY(ctdb, con);
2918 con->src_addr = src;
2919 con->dst_addr = dst;
2920 con->count = 0;
2921 con->killtcp = killtcp;
2922
2923
2924 trbt_insertarray32_callback(killtcp->connections,
2925 KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
2926 add_killtcp_callback, con);
2927
2928 /*
2929 If we don't have a socket to listen on yet we must create it
2930 */
2931 if (killtcp->capture_fd == -1) {
2932 const char *iface = ctdb_vnn_iface_string(vnn);
2933 killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
2934 if (killtcp->capture_fd == -1) {
2935 DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
2936 "socket on iface '%s' for killtcp (%s)\n",
2937 iface, strerror(errno)));
2938 goto failed;
2939 }
2940 }
2941
2942
2943 if (killtcp->fde == NULL) {
2944 killtcp->fde = tevent_add_fd(ctdb->ev, killtcp,
2945 killtcp->capture_fd,
2946 TEVENT_FD_READ,
2947 capture_tcp_handler, killtcp);
2948 tevent_fd_set_auto_close(killtcp->fde);
2949
2950 /* We also need to set up some events to tickle all these connections
2951 until they are all reset
2952 */
2953 tevent_add_timer(ctdb->ev, killtcp, timeval_current_ofs(1, 0),
2954 ctdb_tickle_sentenced_connections, killtcp);
2955 }
2956
2957 /* tickle him once now */
2958 ctdb_sys_send_tcp(
2959 &con->dst_addr,
2960 &con->src_addr,
2961 0, 0, 0);
2962
2963 return 0;
2964
2965failed:
2966 talloc_free(vnn->killtcp);
2967 vnn->killtcp = NULL;
2968 return -1;
2969}
2970
2971/*
2972 kill a TCP connection.
2973 */
2974int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
2975{
2976 struct ctdb_connection *killtcp = (struct ctdb_connection *)indata.dptr;
2977
2978 return ctdb_killtcp_add_connection(ctdb, &killtcp->src, &killtcp->dst);
2979}
2980
2981/*
2982 called by a daemon to inform us of the entire list of TCP tickles for
2983 a particular public address.
2984 this control should only be sent by the node that is currently serving
2985 that public address.
2986 */
2987int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
2988{
2989 struct ctdb_tickle_list_old *list = (struct ctdb_tickle_list_old *)indata.dptr;
2990 struct ctdb_tcp_array *tcparray;
2991 struct ctdb_vnn *vnn;
2992
2993 /* We must at least have tickles.num or else we cant verify the size
2994 of the received data blob
2995 */
2996 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)) {
2997 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list. Not enough data for the tickle.num field\n"));
2998 return -1;
2999 }
3000
3001 /* verify that the size of data matches what we expect */
3002 if (indata.dsize < offsetof(struct ctdb_tickle_list_old, connections)
3003 + sizeof(struct ctdb_connection) * list->num) {
3004 DEBUG(DEBUG_ERR,("Bad indata in ctdb_tickle_list\n"));
3005 return -1;
3006 }
3007
3008 DEBUG(DEBUG_INFO, ("Received tickle update for public address %s\n",
3009 ctdb_addr_to_str(&list->addr)));
3010
3011 vnn = find_public_ip_vnn(ctdb, &list->addr);
3012 if (vnn == NULL) {
3013 DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n",
3014 ctdb_addr_to_str(&list->addr)));
3015
3016 return 1;
3017 }
3018
3019 if (vnn->pnn == ctdb->pnn) {
3020 DEBUG(DEBUG_INFO,
3021 ("Ignoring redundant set tcp tickle list, this node hosts '%s'\n",
3022 ctdb_addr_to_str(&list->addr)));
3023 return 0;
3024 }
3025
3026 /* remove any old ticklelist we might have */
3027 talloc_free(vnn->tcp_array);
3028 vnn->tcp_array = NULL;
3029
3030 tcparray = talloc(vnn, struct ctdb_tcp_array);
3031 CTDB_NO_MEMORY(ctdb, tcparray);
3032
3033 tcparray->num = list->num;
3034
3035 tcparray->connections = talloc_array(tcparray, struct ctdb_connection, tcparray->num);
3036 CTDB_NO_MEMORY(ctdb, tcparray->connections);
3037
3038 memcpy(tcparray->connections, &list->connections[0],
3039 sizeof(struct ctdb_connection)*tcparray->num);
3040
3041 /* We now have a new fresh tickle list array for this vnn */
3042 vnn->tcp_array = tcparray;
3043
3044 return 0;
3045}
3046
3047/*
3048 called to return the full list of tickles for the puclic address associated
3049 with the provided vnn
3050 */
3051int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
3052{
3053 ctdb_sock_addr *addr = (ctdb_sock_addr *)indata.dptr;
3054 struct ctdb_tickle_list_old *list;
3055 struct ctdb_tcp_array *tcparray;
3056 int num;
3057 struct ctdb_vnn *vnn;
3058
3059 vnn = find_public_ip_vnn(ctdb, addr);
3060 if (vnn == NULL) {
3061 DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n",
3062 ctdb_addr_to_str(addr)));
3063
3064 return 1;
3065 }
3066
3067 tcparray = vnn->tcp_array;
3068 if (tcparray) {
3069 num = tcparray->num;
3070 } else {
3071 num = 0;
3072 }
3073
3074 outdata->dsize = offsetof(struct ctdb_tickle_list_old, connections)
3075 + sizeof(struct ctdb_connection) * num;
3076
3077 outdata->dptr = talloc_size(outdata, outdata->dsize);
3078 CTDB_NO_MEMORY(ctdb, outdata->dptr);
3079 list = (struct ctdb_tickle_list_old *)outdata->dptr;
3080
3081 list->addr = *addr;
3082 list->num = num;
3083 if (num) {
3084 memcpy(&list->connections[0], tcparray->connections,
3085 sizeof(struct ctdb_connection) * num);
3086 }
3087
3088 return 0;
3089}
3090
3091
3092/*
3093 set the list of all tcp tickles for a public address
3094 */
3095static int ctdb_send_set_tcp_tickles_for_ip(struct ctdb_context *ctdb,
3096 ctdb_sock_addr *addr,
3097 struct ctdb_tcp_array *tcparray)
3098{
3099 int ret, num;
3100 TDB_DATA data;
3101 struct ctdb_tickle_list_old *list;
3102
3103 if (tcparray) {
3104 num = tcparray->num;
3105 } else {
3106 num = 0;
3107 }
3108
3109 data.dsize = offsetof(struct ctdb_tickle_list_old, connections) +
3110 sizeof(struct ctdb_connection) * num;
3111 data.dptr = talloc_size(ctdb, data.dsize);
3112 CTDB_NO_MEMORY(ctdb, data.dptr);
3113
3114 list = (struct ctdb_tickle_list_old *)data.dptr;
3115 list->addr = *addr;
3116 list->num = num;
3117 if (tcparray) {
3118 memcpy(&list->connections[0], tcparray->connections, sizeof(struct ctdb_connection) * num);
3119 }
3120
3121 ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, 0,
3122 CTDB_CONTROL_SET_TCP_TICKLE_LIST,
3123 0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
3124 if (ret != 0) {
3125 DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
3126 return -1;
3127 }
3128
3129 talloc_free(data.dptr);
3130
3131 return ret;
3132}
3133
3134
3135/*
3136 perform tickle updates if required
3137 */
3138static void ctdb_update_tcp_tickles(struct tevent_context *ev,
3139 struct tevent_timer *te,
3140 struct timeval t, void *private_data)
3141{
3142 struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
3143 int ret;
3144 struct ctdb_vnn *vnn;
3145
3146 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3147 /* we only send out updates for public addresses that
3148 we have taken over
3149 */
3150 if (ctdb->pnn != vnn->pnn) {
3151 continue;
3152 }
3153 /* We only send out the updates if we need to */
3154 if (!vnn->tcp_update_needed) {
3155 continue;
3156 }
3157 ret = ctdb_send_set_tcp_tickles_for_ip(ctdb,
3158 &vnn->public_address,
3159 vnn->tcp_array);
3160 if (ret != 0) {
3161 DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
3162 ctdb_addr_to_str(&vnn->public_address)));
3163 } else {
3164 DEBUG(DEBUG_INFO,
3165 ("Sent tickle update for public address %s\n",
3166 ctdb_addr_to_str(&vnn->public_address)));
3167 vnn->tcp_update_needed = false;
3168 }
3169 }
3170
3171 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3172 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3173 ctdb_update_tcp_tickles, ctdb);
3174}
3175
3176/*
3177 start periodic update of tcp tickles
3178 */
3179void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
3180{
3181 ctdb->tickle_update_context = talloc_new(ctdb);
3182
3183 tevent_add_timer(ctdb->ev, ctdb->tickle_update_context,
3184 timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0),
3185 ctdb_update_tcp_tickles, ctdb);
3186}
3187
3188
3189
3190
3191struct control_gratious_arp {
3192 struct ctdb_context *ctdb;
3193 ctdb_sock_addr addr;
3194 const char *iface;
3195 int count;
3196};
3197
3198/*
3199 send a control_gratuitous arp
3200 */
3201static void send_gratious_arp(struct tevent_context *ev,
3202 struct tevent_timer *te,
3203 struct timeval t, void *private_data)
3204{
3205 int ret;
3206 struct control_gratious_arp *arp = talloc_get_type(private_data,
3207 struct control_gratious_arp);
3208
3209 ret = ctdb_sys_send_arp(&arp->addr, arp->iface);
3210 if (ret != 0) {
3211 DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
3212 arp->iface, strerror(errno)));
3213 }
3214
3215
3216 arp->count++;
3217 if (arp->count == CTDB_ARP_REPEAT) {
3218 talloc_free(arp);
3219 return;
3220 }
3221
3222 tevent_add_timer(arp->ctdb->ev, arp,
3223 timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
3224 send_gratious_arp, arp);
3225}
3226
3227
3228/*
3229 send a gratious arp
3230 */
3231int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
3232{
3233 struct ctdb_addr_info_old *gratious_arp = (struct ctdb_addr_info_old *)indata.dptr;
3234 struct control_gratious_arp *arp;
3235
3236 /* verify the size of indata */
3237 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3238 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n",
3239 (unsigned)indata.dsize,
3240 (unsigned)offsetof(struct ctdb_addr_info_old, iface)));
3241 return -1;
3242 }
3243 if (indata.dsize !=
3244 ( offsetof(struct ctdb_addr_info_old, iface)
3245 + gratious_arp->len ) ){
3246
3247 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3248 "but should be %u bytes\n",
3249 (unsigned)indata.dsize,
3250 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+gratious_arp->len)));
3251 return -1;
3252 }
3253
3254
3255 arp = talloc(ctdb, struct control_gratious_arp);
3256 CTDB_NO_MEMORY(ctdb, arp);
3257
3258 arp->ctdb = ctdb;
3259 arp->addr = gratious_arp->addr;
3260 arp->iface = talloc_strdup(arp, gratious_arp->iface);
3261 CTDB_NO_MEMORY(ctdb, arp->iface);
3262 arp->count = 0;
3263
3264 tevent_add_timer(arp->ctdb->ev, arp,
3265 timeval_zero(), send_gratious_arp, arp);
3266
3267 return 0;
3268}
3269
3270int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
3271{
3272 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3273 int ret;
3274
3275 /* verify the size of indata */
3276 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3277 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3278 return -1;
3279 }
3280 if (indata.dsize !=
3281 ( offsetof(struct ctdb_addr_info_old, iface)
3282 + pub->len ) ){
3283
3284 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3285 "but should be %u bytes\n",
3286 (unsigned)indata.dsize,
3287 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3288 return -1;
3289 }
3290
3291 DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
3292
3293 ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
3294
3295 if (ret != 0) {
3296 DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
3297 return -1;
3298 }
3299
3300 return 0;
3301}
3302
/* Carried from ctdb_control_del_public_address() to delete_ip_callback() */
struct delete_ip_callback_state {
	struct ctdb_req_control_old *c;	/* request that delete_ip_callback() replies to */
};
3306
3307/*
3308 called when releaseip event finishes for del_public_address
3309 */
3310static void delete_ip_callback(struct ctdb_context *ctdb,
3311 int32_t status, TDB_DATA data,
3312 const char *errormsg,
3313 void *private_data)
3314{
3315 struct delete_ip_callback_state *state =
3316 talloc_get_type(private_data, struct delete_ip_callback_state);
3317
3318 /* If release failed then fail. */
3319 ctdb_request_control_reply(ctdb, state->c, NULL, status, errormsg);
3320 talloc_free(private_data);
3321}
3322
3323int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb,
3324 struct ctdb_req_control_old *c,
3325 TDB_DATA indata, bool *async_reply)
3326{
3327 struct ctdb_addr_info_old *pub = (struct ctdb_addr_info_old *)indata.dptr;
3328 struct ctdb_vnn *vnn;
3329
3330 /* verify the size of indata */
3331 if (indata.dsize < offsetof(struct ctdb_addr_info_old, iface)) {
3332 DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_addr_info structure\n"));
3333 return -1;
3334 }
3335 if (indata.dsize !=
3336 ( offsetof(struct ctdb_addr_info_old, iface)
3337 + pub->len ) ){
3338
3339 DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
3340 "but should be %u bytes\n",
3341 (unsigned)indata.dsize,
3342 (unsigned)(offsetof(struct ctdb_addr_info_old, iface)+pub->len)));
3343 return -1;
3344 }
3345
3346 DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
3347
3348 /* walk over all public addresses until we find a match */
3349 for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
3350 if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
3351 if (vnn->pnn == ctdb->pnn) {
3352 struct delete_ip_callback_state *state;
3353 struct ctdb_public_ip *ip;
3354 TDB_DATA data;
3355 int ret;
3356
3357 vnn->delete_pending = true;
3358
3359 state = talloc(ctdb,
3360 struct delete_ip_callback_state);
3361 CTDB_NO_MEMORY(ctdb, state);
3362 state->c = c;
3363
3364 ip = talloc(state, struct ctdb_public_ip);
3365 if (ip == NULL) {
3366 DEBUG(DEBUG_ERR,
3367 (__location__ " Out of memory\n"));
3368 talloc_free(state);
3369 return -1;
3370 }
3371 ip->pnn = -1;
3372 ip->addr = pub->addr;
3373
3374 data.dsize = sizeof(struct ctdb_public_ip);
3375 data.dptr = (unsigned char *)ip;
3376
3377 ret = ctdb_daemon_send_control(ctdb,
3378 ctdb_get_pnn(ctdb),
3379 0,
3380 CTDB_CONTROL_RELEASE_IP,
3381 0, 0,
3382 data,
3383 delete_ip_callback,
3384 state);
3385 if (ret == -1) {
3386 DEBUG(DEBUG_ERR,
3387 (__location__ "Unable to send "
3388 "CTDB_CONTROL_RELEASE_IP\n"));
3389 talloc_free(state);
3390 return -1;
3391 }
3392
3393 state->c = talloc_steal(state, c);
3394 *async_reply = true;
3395 } else {
3396 /* This IP is not hosted on the
3397 * current node so just delete it
3398 * now. */
3399 do_delete_ip(ctdb, vnn);
3400 }
3401
3402 return 0;
3403 }
3404 }
3405
3406 DEBUG(DEBUG_ERR,("Delete IP of unknown public IP address %s\n",
3407 ctdb_addr_to_str(&pub->addr)));
3408 return -1;
3409}
3410
3411
/*
  State passed to ctdb_ipreallocated_callback() while the "ipreallocated"
  event started by ctdb_control_ipreallocated() is running.
 */
struct ipreallocated_callback_state {
	/* The control request that is answered when the event finishes. */
	struct ctdb_req_control_old *c;
};
3415
3416static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
3417 int status, void *p)
3418{
3419 struct ipreallocated_callback_state *state =
3420 talloc_get_type(p, struct ipreallocated_callback_state);
3421
3422 if (status != 0) {
3423 DEBUG(DEBUG_ERR,
3424 (" \"ipreallocated\" event script failed (status %d)\n",
3425 status));
3426 if (status == -ETIME) {
3427 ctdb_ban_self(ctdb);
3428 }
3429 }
3430
3431 ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
3432 talloc_free(state);
3433}
3434
3435/* A control to run the ipreallocated event */
3436int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
3437 struct ctdb_req_control_old *c,
3438 bool *async_reply)
3439{
3440 int ret;
3441 struct ipreallocated_callback_state *state;
3442
3443 state = talloc(ctdb, struct ipreallocated_callback_state);
3444 CTDB_NO_MEMORY(ctdb, state);
3445
3446 DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
3447
3448 ret = ctdb_event_script_callback(ctdb, state,
3449 ctdb_ipreallocated_callback, state,
3450 CTDB_EVENT_IPREALLOCATED,
3451 "%s", "");
3452
3453 if (ret != 0) {
3454 DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
3455 talloc_free(state);
3456 return -1;
3457 }
3458
3459 /* tell the control that we will be reply asynchronously */
3460 state->c = talloc_steal(state, c);
3461 *async_reply = true;
3462
3463 return 0;
3464}
3465
3466
3467/* This function is called from the recovery daemon to verify that a remote
3468 node has the expected ip allocation.
3469 This is verified against ctdb->ip_tree
3470*/
3471static int verify_remote_ip_allocation(struct ctdb_context *ctdb,
3472 struct ctdb_public_ip_list_old *ips,
3473 uint32_t pnn)
3474{
3475 struct public_ip_list *tmp_ip;
3476 int i;
3477
3478 if (ctdb->ip_tree == NULL) {
3479 /* don't know the expected allocation yet, assume remote node
3480 is correct. */
3481 return 0;
3482 }
3483
3484 if (ips == NULL) {
3485 return 0;
3486 }
3487
3488 for (i=0; i<ips->num; i++) {
3489 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
3490 if (tmp_ip == NULL) {
3491 DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
3492 return -1;
3493 }
3494
3495 if (tmp_ip->pnn == -1 || ips->ips[i].pnn == -1) {
3496 continue;
3497 }
3498
3499 if (tmp_ip->pnn != ips->ips[i].pnn) {
3500 DEBUG(DEBUG_ERR,
3501 ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
3502 pnn,
3503 ctdb_addr_to_str(&ips->ips[i].addr),
3504 ips->ips[i].pnn, tmp_ip->pnn));
3505 return -1;
3506 }
3507 }
3508
3509 return 0;
3510}
3511
3512int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
3513{
3514 struct public_ip_list *tmp_ip;
3515
3516 /* IP tree is never built if DisableIPFailover is set */
3517 if (ctdb->tunable.disable_ip_failover != 0) {
3518 return 0;
3519 }
3520
3521 if (ctdb->ip_tree == NULL) {
3522 DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
3523 return -1;
3524 }
3525
3526 tmp_ip = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ip->addr));
3527 if (tmp_ip == NULL) {
3528 DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
3529 return -1;
3530 }
3531
3532 DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), tmp_ip->pnn, ip->pnn));
3533 tmp_ip->pnn = ip->pnn;
3534
3535 return 0;
3536}
3537
3538void clear_ip_assignment_tree(struct ctdb_context *ctdb)
3539{
3540 TALLOC_FREE(ctdb->ip_tree);
3541}
3542
/*
  Tracks one in-progress "reload public IPs" request, which is carried
  out by a forked child process (see ctdb_control_reload_public_ips()).
 */
struct ctdb_reloadips_handle {
	/* daemon context the request belongs to */
	struct ctdb_context *ctdb;
	/* control request answered from the destructor; NULL until the
	 * request has been taken over */
	struct ctdb_req_control_old *c;
	/* status used for the reply; starts as -1 and is set from the
	 * result byte read from the child */
	int status;
	/* pipe: the child writes its result to fd[1], the parent reads it
	 * from fd[0] */
	int fd[2];
	/* pid of the forked child; killed from the destructor */
	pid_t child;
	/* fd event watching fd[0] for the child's result */
	struct tevent_fd *fde;
};
3551
3552static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
3553{
3554 if (h == h->ctdb->reload_ips) {
3555 h->ctdb->reload_ips = NULL;
3556 }
3557 if (h->c != NULL) {
3558 ctdb_request_control_reply(h->ctdb, h->c, NULL, h->status, NULL);
3559 h->c = NULL;
3560 }
3561 ctdb_kill(h->ctdb, h->child, SIGKILL);
3562 return 0;
3563}
3564
3565static void ctdb_reloadips_timeout_event(struct tevent_context *ev,
3566 struct tevent_timer *te,
3567 struct timeval t, void *private_data)
3568{
3569 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3570
3571 talloc_free(h);
3572}
3573
3574static void ctdb_reloadips_child_handler(struct tevent_context *ev,
3575 struct tevent_fd *fde,
3576 uint16_t flags, void *private_data)
3577{
3578 struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
3579
3580 char res;
3581 int ret;
3582
3583 ret = sys_read(h->fd[0], &res, 1);
3584 if (ret < 1 || res != 0) {
3585 DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
3586 res = 1;
3587 }
3588 h->status = res;
3589
3590 talloc_free(h);
3591}
3592
3593static int ctdb_reloadips_child(struct ctdb_context *ctdb)
3594{
3595 TALLOC_CTX *mem_ctx = talloc_new(NULL);
3596 struct ctdb_public_ip_list_old *ips;
3597 struct ctdb_vnn *vnn;
3598 struct client_async_data *async_data;
3599 struct timeval timeout;
3600 TDB_DATA data;
3601 struct ctdb_client_control_state *state;
3602 bool first_add;
3603 int i, ret;
3604
3605 CTDB_NO_MEMORY(ctdb, mem_ctx);
3606
3607 /* Read IPs from local node */
3608 ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
3609 CTDB_CURRENT_NODE, mem_ctx, &ips);
3610 if (ret != 0) {
3611 DEBUG(DEBUG_ERR,
3612 ("Unable to fetch public IPs from local node\n"));
3613 talloc_free(mem_ctx);
3614 return -1;
3615 }
3616
3617 /* Read IPs file - this is safe since this is a child process */
3618 ctdb->vnn = NULL;
3619 if (ctdb_set_public_addresses(ctdb, false) != 0) {
3620 DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
3621 talloc_free(mem_ctx);
3622 return -1;
3623 }
3624
3625 async_data = talloc_zero(mem_ctx, struct client_async_data);
3626 CTDB_NO_MEMORY(ctdb, async_data);
3627
3628 /* Compare IPs between node and file for IPs to be deleted */
3629 for (i = 0; i < ips->num; i++) {
3630 /* */
3631 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3632 if (ctdb_same_ip(&vnn->public_address,
3633 &ips->ips[i].addr)) {
3634 /* IP is still in file */
3635 break;
3636 }
3637 }
3638
3639 if (vnn == NULL) {
3640 /* Delete IP ips->ips[i] */
3641 struct ctdb_addr_info_old *pub;
3642
3643 DEBUG(DEBUG_NOTICE,
3644 ("IP %s no longer configured, deleting it\n",
3645 ctdb_addr_to_str(&ips->ips[i].addr)));
3646
3647 pub = talloc_zero(mem_ctx, struct ctdb_addr_info_old);
3648 CTDB_NO_MEMORY(ctdb, pub);
3649
3650 pub->addr = ips->ips[i].addr;
3651 pub->mask = 0;
3652 pub->len = 0;
3653
3654 timeout = TAKEOVER_TIMEOUT();
3655
3656 data.dsize = offsetof(struct ctdb_addr_info_old,
3657 iface) + pub->len;
3658 data.dptr = (uint8_t *)pub;
3659
3660 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3661 CTDB_CONTROL_DEL_PUBLIC_IP,
3662 0, data, async_data,
3663 &timeout, NULL);
3664 if (state == NULL) {
3665 DEBUG(DEBUG_ERR,
3666 (__location__
3667 " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
3668 goto failed;
3669 }
3670
3671 ctdb_client_async_add(async_data, state);
3672 }
3673 }
3674
3675 /* Compare IPs between node and file for IPs to be added */
3676 first_add = true;
3677 for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
3678 for (i = 0; i < ips->num; i++) {
3679 if (ctdb_same_ip(&vnn->public_address,
3680 &ips->ips[i].addr)) {
3681 /* IP already on node */
3682 break;
3683 }
3684 }
3685 if (i == ips->num) {
3686 /* Add IP ips->ips[i] */
3687 struct ctdb_addr_info_old *pub;
3688 const char *ifaces = NULL;
3689 uint32_t len;
3690 int iface = 0;
3691
3692 DEBUG(DEBUG_NOTICE,
3693 ("New IP %s configured, adding it\n",
3694 ctdb_addr_to_str(&vnn->public_address)));
3695 if (first_add) {
3696 uint32_t pnn = ctdb_get_pnn(ctdb);
3697
3698 data.dsize = sizeof(pnn);
3699 data.dptr = (uint8_t *)&pnn;
3700
3701 ret = ctdb_client_send_message(
3702 ctdb,
3703 CTDB_BROADCAST_CONNECTED,
3704 CTDB_SRVID_REBALANCE_NODE,
3705 data);
3706 if (ret != 0) {
3707 DEBUG(DEBUG_WARNING,
3708 ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
3709 }
3710
3711 first_add = false;
3712 }
3713
3714 ifaces = vnn->ifaces[0];
3715 iface = 1;
3716 while (vnn->ifaces[iface] != NULL) {
3717 ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
3718 vnn->ifaces[iface]);
3719 iface++;
3720 }
3721
3722 len = strlen(ifaces) + 1;
3723 pub = talloc_zero_size(mem_ctx,
3724 offsetof(struct ctdb_addr_info_old, iface) + len);
3725 CTDB_NO_MEMORY(ctdb, pub);
3726
3727 pub->addr = vnn->public_address;
3728 pub->mask = vnn->public_netmask_bits;
3729 pub->len = len;
3730 memcpy(&pub->iface[0], ifaces, pub->len);
3731
3732 timeout = TAKEOVER_TIMEOUT();
3733
3734 data.dsize = offsetof(struct ctdb_addr_info_old,
3735 iface) + pub->len;
3736 data.dptr = (uint8_t *)pub;
3737
3738 state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
3739 CTDB_CONTROL_ADD_PUBLIC_IP,
3740 0, data, async_data,
3741 &timeout, NULL);
3742 if (state == NULL) {
3743 DEBUG(DEBUG_ERR,
3744 (__location__
3745 " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
3746 goto failed;
3747 }
3748
3749 ctdb_client_async_add(async_data, state);
3750 }
3751 }
3752
3753 if (ctdb_client_async_wait(ctdb, async_data) != 0) {
3754 DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
3755 goto failed;
3756 }
3757
3758 talloc_free(mem_ctx);
3759 return 0;
3760
3761failed:
3762 talloc_free(mem_ctx);
3763 return -1;
3764}
3765
3766/* This control is sent to force the node to re-read the public addresses file
3767 and drop any addresses we should nnot longer host, and add new addresses
3768 that we are now able to host
3769*/
3770int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control_old *c, bool *async_reply)
3771{
3772 struct ctdb_reloadips_handle *h;
3773 pid_t parent = getpid();
3774
3775 if (ctdb->reload_ips != NULL) {
3776 talloc_free(ctdb->reload_ips);
3777 ctdb->reload_ips = NULL;
3778 }
3779
3780 h = talloc(ctdb, struct ctdb_reloadips_handle);
3781 CTDB_NO_MEMORY(ctdb, h);
3782 h->ctdb = ctdb;
3783 h->c = NULL;
3784 h->status = -1;
3785
3786 if (pipe(h->fd) == -1) {
3787 DEBUG(DEBUG_ERR,("Failed to create pipe for ctdb_freeze_lock\n"));
3788 talloc_free(h);
3789 return -1;
3790 }
3791
3792 h->child = ctdb_fork(ctdb);
3793 if (h->child == (pid_t)-1) {
3794 DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
3795 close(h->fd[0]);
3796 close(h->fd[1]);
3797 talloc_free(h);
3798 return -1;
3799 }
3800
3801 /* child process */
3802 if (h->child == 0) {
3803 signed char res = 0;
3804
3805 close(h->fd[0]);
3806 debug_extra = talloc_asprintf(NULL, "reloadips:");
3807
3808 prctl_set_comment("ctdb_reloadips");
3809 if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
3810 DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
3811 res = -1;
3812 } else {
3813 res = ctdb_reloadips_child(ctdb);
3814 if (res != 0) {
3815 DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
3816 }
3817 }
3818
3819 sys_write(h->fd[1], &res, 1);
3820 /* make sure we die when our parent dies */
3821 while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
3822 sleep(5);
3823 }
3824 _exit(0);
3825 }
3826
3827 h->c = talloc_steal(h, c);
3828
3829 close(h->fd[1]);
3830 set_close_on_exec(h->fd[0]);
3831
3832 talloc_set_destructor(h, ctdb_reloadips_destructor);
3833
3834
3835 h->fde = tevent_add_fd(ctdb->ev, h, h->fd[0], TEVENT_FD_READ,
3836 ctdb_reloadips_child_handler, (void *)h);
3837 tevent_fd_set_auto_close(h->fde);
3838
3839 tevent_add_timer(ctdb->ev, h, timeval_current_ofs(120, 0),
3840 ctdb_reloadips_timeout_event, h);
3841
3842 /* we reply later */
3843 *async_reply = true;
3844 return 0;
3845}
Note: See TracBrowser for help on using the repository browser.