res_corosync: Recover if corosync gets restarted.
[asterisk/asterisk.git] / res / res_corosync.c
1 /*
2  * Asterisk -- An open source telephony toolkit.
3  *
4  * Copyright (C) 2007, Digium, Inc.
5  * Copyright (C) 2012, Russell Bryant
6  *
7  * Russell Bryant <russell@russellbryant.net>
8  *
9  * See http://www.asterisk.org for more information about
10  * the Asterisk project. Please do not directly contact
11  * any of the maintainers of this project for assistance;
12  * the project provides a web site, mailing lists and IRC
13  * channels for your use.
14  *
15  * This program is free software, distributed under the terms of
16  * the GNU General Public License Version 2. See the LICENSE file
17  * at the top of the source tree.
18  */
19
20 /*!
21  * \file
22  * \author Russell Bryant <russell@russellbryant.net>
23  *
24  * This module is based on and replaces the previous res_ais module.
25  */
26
27 /*** MODULEINFO
28         <depend>corosync</depend>
29         <support_level>extended</support_level>
30  ***/
31
32 #include "asterisk.h"
33
34 ASTERISK_FILE_VERSION(__FILE__, "$Revision$");
35
36 #include <corosync/cpg.h>
37 #include <corosync/cfg.h>
38
39 #include "asterisk/module.h"
40 #include "asterisk/logger.h"
41 #include "asterisk/poll-compat.h"
42 #include "asterisk/config.h"
43 #include "asterisk/event.h"
44 #include "asterisk/cli.h"
45 #include "asterisk/devicestate.h"
46
/*! \brief Guards the publish/subscribe flags and subscriptions held in event_types. */
AST_RWLOCK_DEFINE_STATIC(event_types_lock);

/*!
 * \brief Per-event-type settings, indexed by ast_event type value.
 *
 * Only the types named in the initializer have a non-NULL name; any other slot
 * below the highest designated index is zero-filled.
 */
static struct {
        /*! Name that set_event() matches (case-insensitively) against config values */
        const char *name;
        /*! Active ast_event subscription; created by load_general_config() while publishing */
        struct ast_event_sub *sub;
        /*! Non-zero when local events of this type are sent to the cluster */
        unsigned char publish;
        /*! Non-zero when events of this type received from the cluster are accepted */
        unsigned char subscribe;
} event_types[] = {
        [AST_EVENT_MWI] = { .name = "mwi", },
        [AST_EVENT_DEVICE_STATE_CHANGE] = { .name = "device_state", },
};
58
/*! \brief State of the thread that runs dispatch_thread_handler(). */
static struct {
        /*! Thread ID, or AST_PTHREADT_NULL when no thread has been started */
        pthread_t id;
        /*! Pipe used to wake the thread: it polls [0]; cleanup_module() writes to [1] */
        int alert_pipe[2];
        /*! Set by cleanup_module() to make the dispatch loop exit */
        unsigned int stop:1;
} dispatch_thread = {
        .id = AST_PTHREADT_NULL,
        .alert_pipe = { -1, -1 },
};

/*! \brief CPG handle: set up in load_module(), replaced on recovery, finalized in cleanup_module(). */
static cpg_handle_t cpg_handle;
/*! \brief CFG handle: set up in load_module(), replaced on recovery, finalized in cleanup_module(). */
static corosync_cfg_handle_t cfg_handle;
70
/* Forward declarations of the corosync CFG callbacks defined further down. */
static void cfg_state_track_cb(
                corosync_cfg_state_notification_buffer_t *notification_buffer,
                cs_error_t error);

static void cfg_shutdown_cb(corosync_cfg_handle_t cfg_handle,
                corosync_cfg_shutdown_flags_t flags);

/*! \brief Callback table handed to corosync_cfg_initialize(). */
static corosync_cfg_callbacks_t cfg_callbacks = {
        .corosync_cfg_state_track_callback = cfg_state_track_cb,
        .corosync_cfg_shutdown_callback = cfg_shutdown_cb,
};

/* Forward declarations of the corosync CPG callbacks defined further down. */
static void cpg_deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name,
                uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len);

static void cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name,
                const struct cpg_address *member_list, size_t member_list_entries,
                const struct cpg_address *left_list, size_t left_list_entries,
                const struct cpg_address *joined_list, size_t joined_list_entries);

/*! \brief Callback table handed to cpg_initialize(). */
static cpg_callbacks_t cpg_callbacks = {
        .cpg_deliver_fn = cpg_deliver_cb,
        .cpg_confchg_fn = cpg_confchg_cb,
};

/* Event subscription callback; declared early because cpg_confchg_cb() uses it. */
static void ast_event_cb(const struct ast_event *event, void *data);
97
/*!
 * \brief CFG state-tracking callback.
 *
 * Intentionally empty: it exists only to fill the slot in cfg_callbacks.
 */
static void cfg_state_track_cb(
                corosync_cfg_state_notification_buffer_t *notification_buffer,
                cs_error_t error)
{
}
103
/*!
 * \brief CFG shutdown callback.
 *
 * Intentionally empty: it exists only to fill the slot in cfg_callbacks.
 */
static void cfg_shutdown_cb(corosync_cfg_handle_t cfg_handle,
                corosync_cfg_shutdown_flags_t flags)
{
}
108
109 static void cpg_deliver_cb(cpg_handle_t handle, const struct cpg_name *group_name,
110                 uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
111 {
112         struct ast_event *event;
113
114         if (msg_len < ast_event_minimum_length()) {
115                 ast_debug(1, "Ignoring event that's too small. %u < %u\n",
116                         (unsigned int) msg_len,
117                         (unsigned int) ast_event_minimum_length());
118                 return;
119         }
120
121         if (!ast_eid_cmp(&ast_eid_default, ast_event_get_ie_raw(msg, AST_EVENT_IE_EID))) {
122                 /* Don't feed events back in that originated locally. */
123                 return;
124         }
125
126         ast_rwlock_rdlock(&event_types_lock);
127         if (!event_types[ast_event_get_type(msg)].subscribe) {
128                 /* We are not configured to subscribe to these events. */
129                 ast_rwlock_unlock(&event_types_lock);
130                 return;
131         }
132         ast_rwlock_unlock(&event_types_lock);
133
134         if (!(event = ast_malloc(msg_len))) {
135                 return;
136         }
137
138         memcpy(event, msg, msg_len);
139
140         ast_event_queue_and_cache(event);
141 }
142
143 static void cpg_confchg_cb(cpg_handle_t handle, const struct cpg_name *group_name,
144                 const struct cpg_address *member_list, size_t member_list_entries,
145                 const struct cpg_address *left_list, size_t left_list_entries,
146                 const struct cpg_address *joined_list, size_t joined_list_entries)
147 {
148         unsigned int i;
149
150         /* If any new nodes have joined, dump our cache of events we are publishing
151          * that originated from this server. */
152
153         if (!joined_list_entries) {
154                 return;
155         }
156
157         for (i = 0; i < ARRAY_LEN(event_types); i++) {
158                 struct ast_event_sub *event_sub;
159
160                 ast_rwlock_rdlock(&event_types_lock);
161                 if (!event_types[i].publish) {
162                         ast_rwlock_unlock(&event_types_lock);
163                         continue;
164                 }
165                 ast_rwlock_unlock(&event_types_lock);
166
167                 event_sub = ast_event_subscribe_new(i, ast_event_cb, NULL);
168                 ast_event_sub_append_ie_raw(event_sub, AST_EVENT_IE_EID,
169                                         &ast_eid_default, sizeof(ast_eid_default));
170                 ast_event_dump_cache(event_sub);
171                 ast_event_sub_destroy(event_sub);
172         }
173 }
174
175 static void *dispatch_thread_handler(void *data)
176 {
177         cs_error_t cs_err;
178         struct pollfd pfd[3] = {
179                 { .events = POLLIN, },
180                 { .events = POLLIN, },
181                 { .events = POLLIN, },
182         };
183
184         if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
185                 ast_log(LOG_ERROR, "Failed to get CPG fd.  This module is now broken.\n");
186                 return NULL;
187         }
188
189         if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
190                 ast_log(LOG_ERROR, "Failed to get CFG fd.  This module is now broken.\n");
191                 return NULL;
192         }
193
194         pfd[2].fd = dispatch_thread.alert_pipe[0];
195
196         while (!dispatch_thread.stop) {
197                 int res;
198
199                 cs_err = CS_OK;
200
201                 pfd[0].revents = 0;
202                 pfd[1].revents = 0;
203                 pfd[2].revents = 0;
204
205                 res = ast_poll(pfd, ARRAY_LEN(pfd), -1);
206                 if (res == -1 && errno != EINTR && errno != EAGAIN) {
207                         ast_log(LOG_ERROR, "poll() error: %s (%d)\n", strerror(errno), errno);
208                         continue;
209                 }
210
211                 if (pfd[0].revents & POLLIN) {
212                         if ((cs_err = cpg_dispatch(cpg_handle, CS_DISPATCH_ALL)) != CS_OK) {
213                                 ast_log(LOG_WARNING, "Failed CPG dispatch: %d\n", cs_err);
214                         }
215                 }
216
217                 if (pfd[1].revents & POLLIN) {
218                         if ((cs_err = corosync_cfg_dispatch(cfg_handle, CS_DISPATCH_ALL)) != CS_OK) {
219                                 ast_log(LOG_WARNING, "Failed CFG dispatch: %d\n", cs_err);
220                         }
221                 }
222
223                 if (cs_err == CS_ERR_LIBRARY || cs_err == CS_ERR_BAD_HANDLE) {
224                         struct cpg_name name;
225
226                         /* If corosync gets restarted out from under Asterisk, try to recover. */
227
228                         ast_log(LOG_NOTICE, "Attempting to recover from corosync failure.\n");
229
230                         if ((cs_err = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
231                                 ast_log(LOG_ERROR, "Failed to initialize cfg (%d)\n", (int) cs_err);
232                                 sleep(5);
233                                 continue;
234                         }
235
236                         if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks) != CS_OK)) {
237                                 ast_log(LOG_ERROR, "Failed to initialize cpg (%d)\n", (int) cs_err);
238                                 sleep(5);
239                                 continue;
240                         }
241
242                         if ((cs_err = cpg_fd_get(cpg_handle, &pfd[0].fd)) != CS_OK) {
243                                 ast_log(LOG_ERROR, "Failed to get CPG fd.\n");
244                                 sleep(5);
245                                 continue;
246                         }
247
248                         if ((cs_err = corosync_cfg_fd_get(cfg_handle, &pfd[1].fd)) != CS_OK) {
249                                 ast_log(LOG_ERROR, "Failed to get CFG fd.\n");
250                                 sleep(5);
251                                 continue;
252                         }
253
254                         ast_copy_string(name.value, "asterisk", sizeof(name.value));
255                         name.length = strlen(name.value);
256                         if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
257                                 ast_log(LOG_ERROR, "Failed to join cpg (%d)\n", (int) cs_err);
258                                 sleep(5);
259                                 continue;
260                         }
261
262                         ast_log(LOG_NOTICE, "Corosync recovery complete.\n");
263                 }
264         }
265
266         return NULL;
267 }
268
269 static void ast_event_cb(const struct ast_event *event, void *data)
270 {
271         cs_error_t cs_err;
272         struct iovec iov = {
273                 .iov_base = (void *) event,
274                 .iov_len = ast_event_get_size(event),
275         };
276
277         if (ast_eid_cmp(&ast_eid_default,
278                         ast_event_get_ie_raw(event, AST_EVENT_IE_EID))) {
279                 /* If the event didn't originate from this server, don't send it back out. */
280                 return;
281         }
282
283         /* The ast_event subscription will only exist if we are configured to publish
284          * these events, so just send away. */
285
286         if ((cs_err = cpg_mcast_joined(cpg_handle, CPG_TYPE_FIFO, &iov, 1)) != CS_OK) {
287                 ast_log(LOG_WARNING, "CPG mcast failed (%d)\n", cs_err);
288         }
289 }
290
291 static char *corosync_show_members(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a)
292 {
293         cs_error_t cs_err;
294         cpg_iteration_handle_t cpg_iter;
295         struct cpg_iteration_description_t cpg_desc;
296         unsigned int i;
297
298         switch (cmd) {
299         case CLI_INIT:
300                 e->command = "corosync show members";
301                 e->usage =
302                         "Usage: corosync show members\n"
303                         "       Show corosync cluster members\n";
304                 return NULL;
305
306         case CLI_GENERATE:
307                 return NULL;    /* no completion */
308         }
309
310         if (a->argc != e->args) {
311                 return CLI_SHOWUSAGE;
312         }
313
314         cs_err = cpg_iteration_initialize(cpg_handle, CPG_ITERATION_ALL, NULL, &cpg_iter);
315
316         if (cs_err != CS_OK) {
317                 ast_cli(a->fd, "Failed to initialize CPG iterator.\n");
318                 return CLI_FAILURE;
319         }
320
321         ast_cli(a->fd, "\n"
322                     "=============================================================\n"
323                     "=== Cluster members =========================================\n"
324                     "=============================================================\n"
325                     "===\n");
326
327         for (i = 1, cs_err = cpg_iteration_next(cpg_iter, &cpg_desc);
328                         cs_err == CS_OK;
329                         cs_err = cpg_iteration_next(cpg_iter, &cpg_desc), i++) {
330                 corosync_cfg_node_address_t addrs[8];
331                 int num_addrs = 0;
332                 unsigned int j;
333
334                 cs_err = corosync_cfg_get_node_addrs(cfg_handle, cpg_desc.nodeid,
335                                 ARRAY_LEN(addrs), &num_addrs, addrs);
336                 if (cs_err != CS_OK) {
337                         ast_log(LOG_WARNING, "Failed to get node addresses\n");
338                         continue;
339                 }
340
341                 ast_cli(a->fd, "=== Node %d\n", i);
342                 ast_cli(a->fd, "=== --> Group: %s\n", cpg_desc.group.value);
343
344                 for (j = 0; j < num_addrs; j++) {
345                         struct sockaddr *sa = (struct sockaddr *) addrs[j].address;
346                         size_t sa_len = (size_t) addrs[j].address_length;
347                         char buf[128];
348
349                         getnameinfo(sa, sa_len, buf, sizeof(buf), NULL, 0, NI_NUMERICHOST);
350
351                         ast_cli(a->fd, "=== --> Address %d: %s\n", j + 1, buf);
352                 }
353
354         }
355
356         ast_cli(a->fd, "===\n"
357                        "=============================================================\n"
358                        "\n");
359
360         cpg_iteration_finalize(cpg_iter);
361
362         return CLI_SUCCESS;
363 }
364
365 static char *corosync_show_config(struct ast_cli_entry *e, int cmd, struct ast_cli_args *a)
366 {
367         unsigned int i;
368
369         switch (cmd) {
370         case CLI_INIT:
371                 e->command = "corosync show config";
372                 e->usage =
373                         "Usage: corosync show config\n"
374                         "       Show configuration loaded from res_corosync.conf\n";
375                 return NULL;
376
377         case CLI_GENERATE:
378                 return NULL;    /* no completion */
379         }
380
381         if (a->argc != e->args) {
382                 return CLI_SHOWUSAGE;
383         }
384
385         ast_cli(a->fd, "\n"
386                     "=============================================================\n"
387                     "=== res_corosync config =====================================\n"
388                     "=============================================================\n"
389                     "===\n");
390
391         ast_rwlock_rdlock(&event_types_lock);
392         for (i = 0; i < ARRAY_LEN(event_types); i++) {
393                 if (event_types[i].publish) {
394                         ast_cli(a->fd, "=== ==> Publishing Event Type: %s\n",
395                                         event_types[i].name);
396                 }
397                 if (event_types[i].subscribe) {
398                         ast_cli(a->fd, "=== ==> Subscribing to Event Type: %s\n",
399                                         event_types[i].name);
400                 }
401         }
402         ast_rwlock_unlock(&event_types_lock);
403
404         ast_cli(a->fd, "===\n"
405                        "=============================================================\n"
406                        "\n");
407
408         return CLI_SUCCESS;
409 }
410
/*! \brief CLI commands registered by load_module() and removed by unload_module(). */
static struct ast_cli_entry corosync_cli[] = {
        AST_CLI_DEFINE(corosync_show_config, "Show configuration"),
        AST_CLI_DEFINE(corosync_show_members, "Show cluster members"),
};

/*! \brief Selects which flag set_event() turns on for the matched event type. */
enum {
        PUBLISH,
        SUBSCRIBE,
};
420
421 static int set_event(const char *event_type, int pubsub)
422 {
423         unsigned int i;
424
425         for (i = 0; i < ARRAY_LEN(event_types); i++) {
426                 if (!event_types[i].name || strcasecmp(event_type, event_types[i].name)) {
427                         continue;
428                 }
429
430                 switch (pubsub) {
431                 case PUBLISH:
432                         event_types[i].publish = 1;
433                         break;
434                 case SUBSCRIBE:
435                         event_types[i].subscribe = 1;
436                         break;
437                 }
438
439                 break;
440         }
441
442         return (i == ARRAY_LEN(event_types)) ? -1 : 0;
443 }
444
445 static int load_general_config(struct ast_config *cfg)
446 {
447         struct ast_variable *v;
448         int res = 0;
449         unsigned int i;
450
451         ast_rwlock_wrlock(&event_types_lock);
452
453         for (i = 0; i < ARRAY_LEN(event_types); i++) {
454                 event_types[i].publish = 0;
455                 event_types[i].subscribe = 0;
456         }
457
458         for (v = ast_variable_browse(cfg, "general"); v && !res; v = v->next) {
459                 if (!strcasecmp(v->name, "publish_event")) {
460                         res = set_event(v->value, PUBLISH);
461                 } else if (!strcasecmp(v->name, "subscribe_event")) {
462                         res = set_event(v->value, SUBSCRIBE);
463                 } else {
464                         ast_log(LOG_WARNING, "Unknown option '%s'\n", v->name);
465                 }
466         }
467
468         for (i = 0; i < ARRAY_LEN(event_types); i++) {
469                 if (event_types[i].publish && !event_types[i].sub) {
470                         event_types[i].sub = ast_event_subscribe(i,
471                                                 ast_event_cb, "Corosync", NULL,
472                                                 AST_EVENT_IE_END);
473                 } else if (!event_types[i].publish && event_types[i].sub) {
474                         event_types[i].sub = ast_event_unsubscribe(event_types[i].sub);
475                 }
476         }
477
478         ast_rwlock_unlock(&event_types_lock);
479
480         return res;
481 }
482
483 static int load_config(unsigned int reload)
484 {
485         static const char filename[] = "res_corosync.conf";
486         struct ast_config *cfg;
487         const char *cat = NULL;
488         struct ast_flags config_flags = { 0 };
489         int res = 0;
490
491         cfg = ast_config_load(filename, config_flags);
492
493         if (cfg == CONFIG_STATUS_FILEMISSING || cfg == CONFIG_STATUS_FILEINVALID) {
494                 return -1;
495         }
496
497         while ((cat = ast_category_browse(cfg, cat))) {
498                 if (!strcasecmp(cat, "general")) {
499                         res = load_general_config(cfg);
500                 } else {
501                         ast_log(LOG_WARNING, "Unknown configuration section '%s'\n", cat);
502                 }
503         }
504
505         ast_config_destroy(cfg);
506
507         return res;
508 }
509
510 static void cleanup_module(void)
511 {
512         cs_error_t cs_err;
513         unsigned int i;
514
515         for (i = 0; i < ARRAY_LEN(event_types); i++) {
516                 if (event_types[i].sub) {
517                         event_types[i].sub = ast_event_unsubscribe(event_types[i].sub);
518                 }
519                 event_types[i].publish = 0;
520                 event_types[i].subscribe = 0;
521         }
522
523         if (dispatch_thread.id != AST_PTHREADT_NULL) {
524                 char meepmeep = 'x';
525                 dispatch_thread.stop = 1;
526                 if (ast_carefulwrite(dispatch_thread.alert_pipe[1], &meepmeep, 1,
527                                         5000) == -1) {
528                         ast_log(LOG_ERROR, "Failed to write to pipe: %s (%d)\n",
529                                         strerror(errno), errno);
530                 }
531                 pthread_join(dispatch_thread.id, NULL);
532         }
533
534         if (dispatch_thread.alert_pipe[0] != -1) {
535                 close(dispatch_thread.alert_pipe[0]);
536                 dispatch_thread.alert_pipe[0] = -1;
537         }
538
539         if (dispatch_thread.alert_pipe[1] != -1) {
540                 close(dispatch_thread.alert_pipe[1]);
541                 dispatch_thread.alert_pipe[1] = -1;
542         }
543
544         if (cpg_handle && (cs_err = cpg_finalize(cpg_handle)) != CS_OK) {
545                 ast_log(LOG_ERROR, "Failed to finalize cpg (%d)\n", (int) cs_err);
546         }
547         cpg_handle = 0;
548
549         if (cfg_handle && (cs_err = corosync_cfg_finalize(cfg_handle)) != CS_OK) {
550                 ast_log(LOG_ERROR, "Failed to finalize cfg (%d)\n", (int) cs_err);
551         }
552         cfg_handle = 0;
553 }
554
555 static int load_module(void)
556 {
557         cs_error_t cs_err;
558         enum ast_module_load_result res = AST_MODULE_LOAD_FAILURE;
559         struct cpg_name name;
560
561         if ((cs_err = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks)) != CS_OK) {
562                 ast_log(LOG_ERROR, "Failed to initialize cfg (%d)\n", (int) cs_err);
563                 return AST_MODULE_LOAD_DECLINE;
564         }
565
566         if ((cs_err = cpg_initialize(&cpg_handle, &cpg_callbacks)) != CS_OK) {
567                 ast_log(LOG_ERROR, "Failed to initialize cpg (%d)\n", (int) cs_err);
568                 goto failed;
569         }
570
571         ast_copy_string(name.value, "asterisk", sizeof(name.value));
572         name.length = strlen(name.value);
573
574         if ((cs_err = cpg_join(cpg_handle, &name)) != CS_OK) {
575                 ast_log(LOG_ERROR, "Failed to join (%d)\n", (int) cs_err);
576                 goto failed;
577         }
578
579         if (pipe(dispatch_thread.alert_pipe) == -1) {
580                 ast_log(LOG_ERROR, "Failed to create alert pipe: %s (%d)\n",
581                                 strerror(errno), errno);
582                 goto failed;
583         }
584
585         if (ast_pthread_create_background(&dispatch_thread.id, NULL,
586                         dispatch_thread_handler, NULL)) {
587                 ast_log(LOG_ERROR, "Error starting CPG dispatch thread.\n");
588                 goto failed;
589         }
590
591         if (load_config(0)) {
592                 /* simply not configured is not a fatal error */
593                 res = AST_MODULE_LOAD_DECLINE;
594                 goto failed;
595         }
596
597         ast_cli_register_multiple(corosync_cli, ARRAY_LEN(corosync_cli));
598
599         ast_enable_distributed_devstate();
600
601         return AST_MODULE_LOAD_SUCCESS;
602
603 failed:
604         cleanup_module();
605
606         return res;
607 }
608
609 static int unload_module(void)
610 {
611         ast_cli_unregister_multiple(corosync_cli, ARRAY_LEN(corosync_cli));
612
613         cleanup_module();
614
615         return 0;
616 }
617
/* Module registration, with "Corosync" as the module description. */
AST_MODULE_INFO_STANDARD(ASTERISK_GPL_KEY, "Corosync");