]> git.proxmox.com Git - mirror_corosync.git/commitdiff
stats: Add stats for scheduler misses
authorChristine Caulfield <ccaulfie@redhat.com>
Fri, 17 Jan 2020 14:22:16 +0000 (14:22 +0000)
committerJan Friesse <jfriesse@redhat.com>
Wed, 22 Jan 2020 16:06:10 +0000 (17:06 +0100)
This patch add a stats.schedmiss.* set of entries that
are a record of the last 10 times corosync was not scheduled
in time.

These entries are keypt in reverse order (so stats.schedmiss.0.* is
always the latest one kept) and the values, including the timestamp,
are in milliseconds.

It's also possible to use a cmap tracker to follow these events, which
might be useful.

Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
exec/main.c
exec/stats.c
exec/stats.h
man/cmap_keys.7
tools/corosync-cmapctl.c

index 7a471a16343b8f4c38f5bf15d740fd568f3ff435..fb0486e7a273fdd42beed32bfc21a7a7f21915ee 100644 (file)
@@ -835,6 +835,8 @@ static void timer_function_scheduler_timeout (void *data)
                log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms "
                    "(threshold is %0.4f ms). Consider token timeout increase.",
                    (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC);
+
+               stats_add_schedmiss_event(tv_current / 1000, (float)tv_diff / QB_TIME_NS_IN_MSEC);
        }
 
        /*
index e89504e66b67551070de1bc1552799e4da9d1d4d..d5c1cbcdd611a40f515a29c2bfcdc24b984c1530 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Red Hat, Inc.
+ * Copyright (c) 2017-2020 Red Hat, Inc.
  *
  * All rights reserved.
  *
@@ -60,9 +60,20 @@ LOGSYS_DECLARE_SUBSYS ("STATS");
 
 static qb_map_t *stats_map;
 
+/* Structure of an element in the schedmiss array */
+struct schedmiss_entry {
+       uint64_t timestamp;
+       float delay;
+};
+#define MAX_SCHEDMISS_EVENTS 10
+static struct schedmiss_entry schedmiss_event[MAX_SCHEDMISS_EVENTS];
+static unsigned int highest_schedmiss_event;
+
+#define SCHEDMISS_PREFIX "stats.schedmiss"
+
 /* Convert iterator number to text and a stats pointer */
 struct cs_stats_conv {
-       enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG} type;
+       enum {STAT_PG, STAT_SRP, STAT_KNET, STAT_KNET_HANDLE, STAT_IPCSC, STAT_IPCSG, STAT_SCHEDMISS} type;
        const char *name;
        const size_t offset;
        const icmap_value_types_t value_type;
@@ -190,6 +201,10 @@ struct cs_stats_conv cs_ipcs_global_stats[] = {
        { STAT_IPCSG, "global.active",        offsetof(struct ipcs_global_stats, active),           ICMAP_VALUETYPE_UINT64},
        { STAT_IPCSG, "global.closed",        offsetof(struct ipcs_global_stats, closed),           ICMAP_VALUETYPE_UINT64},
 };
+struct cs_stats_conv cs_schedmiss_stats[] = {
+       { STAT_SCHEDMISS, "timestamp",    offsetof(struct schedmiss_entry, timestamp), ICMAP_VALUETYPE_UINT64},
+       { STAT_SCHEDMISS, "delay",        offsetof(struct schedmiss_entry, delay),     ICMAP_VALUETYPE_FLOAT},
+};
 
 #define NUM_PG_STATS (sizeof(cs_pg_stats) / sizeof(struct cs_stats_conv))
 #define NUM_SRP_STATS (sizeof(cs_srp_stats) / sizeof(struct cs_stats_conv))
@@ -286,7 +301,7 @@ cs_error_t stats_map_init(const struct corosync_api_v1 *corosync_api)
                stats_add_entry(param, &cs_ipcs_global_stats[i]);
        }
 
-       /* KNET and IPCS stats are added when appropriate */
+       /* KNET, IPCS & SCHEDMISS stats are added when appropriate */
        return CS_OK;
 }
 
@@ -307,6 +322,8 @@ cs_error_t stats_map_get(const char *key_name,
        int link_no;
        int service_id;
        uint32_t pid;
+       unsigned int sm_event;
+       char *sm_type;
        void *conn_ptr;
 
        item = qb_map_get(stats_map, key_name);
@@ -363,17 +380,85 @@ cs_error_t stats_map_get(const char *key_name,
                        cs_ipcs_get_global_stats(&ipcs_global_stats);
                        stats_map_set_value(statinfo, &ipcs_global_stats, value, value_len, type);
                        break;
+               case STAT_SCHEDMISS:
+                       if (sscanf(key_name, SCHEDMISS_PREFIX ".%d", &sm_event) != 1) {
+                               return CS_ERR_NOT_EXIST;
+                       }
+
+                       sm_type = strrchr(key_name, '.');
+                       if (sm_type == NULL) {
+                               return CS_ERR_NOT_EXIST;
+                       }
+                       sm_type++;
+
+                       if (strcmp(sm_type, "timestamp") == 0) {
+                               memcpy(value, &schedmiss_event[sm_event].timestamp, sizeof(uint64_t));
+                               *value_len = sizeof(uint64_t);
+                               *type = ICMAP_VALUETYPE_UINT64;
+                       }
+                       if (strcmp(sm_type, "delay") == 0) {
+                               memcpy(value, &schedmiss_event[sm_event].delay, sizeof(float));
+                               *value_len = sizeof(float);
+                               *type = ICMAP_VALUETYPE_FLOAT;
+                       }
+                       break;
                default:
                        return CS_ERR_LIBRARY;
        }
        return CS_OK;
 }
 
-#define STATS_CLEAR       "stats.clear."
-#define STATS_CLEAR_KNET  "stats.clear.knet"
-#define STATS_CLEAR_IPC   "stats.clear.ipc"
-#define STATS_CLEAR_TOTEM "stats.clear.totem"
-#define STATS_CLEAR_ALL   "stats.clear.all"
+static void schedmiss_clear_stats(void)
+{
+       int i;
+       char param[ICMAP_KEYNAME_MAXLEN];
+
+       for (i=0; i<MAX_SCHEDMISS_EVENTS; i++) {
+               if (i < highest_schedmiss_event) {
+                       sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", i);
+                       stats_rm_entry(param);
+                       sprintf(param, SCHEDMISS_PREFIX ".%i.delay", i);
+                       stats_rm_entry(param);
+               }
+               schedmiss_event[i].timestamp = (uint64_t)0LL;
+               schedmiss_event[i].delay = 0.0f;
+       }
+       highest_schedmiss_event = 0;
+}
+
+/* Called from main.c */
+void stats_add_schedmiss_event(uint64_t timestamp, float delay)
+{
+       char param[ICMAP_KEYNAME_MAXLEN];
+       int i;
+
+       /* Move 'em all up */
+       for (i=MAX_SCHEDMISS_EVENTS-2; i>=0; i--) {
+               schedmiss_event[i+1].timestamp = schedmiss_event[i].timestamp;
+               schedmiss_event[i+1].delay = schedmiss_event[i].delay;
+       }
+
+       /* New entries are always at the front */
+       schedmiss_event[0].timestamp = timestamp;
+       schedmiss_event[0].delay = delay;
+
+       /* If we've not run off the end then add an entry in the trie for the new 'end' one */
+       if (highest_schedmiss_event < MAX_SCHEDMISS_EVENTS) {
+               sprintf(param, SCHEDMISS_PREFIX ".%i.timestamp", highest_schedmiss_event);
+               stats_add_entry(param, &cs_schedmiss_stats[0]);
+               sprintf(param, SCHEDMISS_PREFIX ".%i.delay", highest_schedmiss_event);
+               stats_add_entry(param, &cs_schedmiss_stats[1]);
+               highest_schedmiss_event++;
+       }
+       /* Notifications get sent by the stats_updater */
+}
+
+#define STATS_CLEAR           "stats.clear."
+#define STATS_CLEAR_KNET      "stats.clear.knet"
+#define STATS_CLEAR_IPC       "stats.clear.ipc"
+#define STATS_CLEAR_TOTEM     "stats.clear.totem"
+#define STATS_CLEAR_ALL       "stats.clear.all"
+#define STATS_CLEAR_SCHEDMISS "stats.clear.schedmiss"
 
 cs_error_t stats_map_set(const char *key_name,
                         const void *value,
@@ -394,9 +479,14 @@ cs_error_t stats_map_set(const char *key_name,
                totempg_stats_clear(TOTEMPG_STATS_CLEAR_TOTEM);
                cleared = 1;
        }
+       if (strncmp(key_name, STATS_CLEAR_SCHEDMISS, strlen(STATS_CLEAR_SCHEDMISS)) == 0) {
+               schedmiss_clear_stats();
+               cleared = 1;
+       }
        if (strncmp(key_name, STATS_CLEAR_ALL, strlen(STATS_CLEAR_ALL)) == 0) {
                totempg_stats_clear(TOTEMPG_STATS_CLEAR_TRANSPORT | TOTEMPG_STATS_CLEAR_TOTEM);
                cs_ipcs_clear_stats();
+               schedmiss_clear_stats();
                cleared = 1;
        }
        if (!cleared) {
@@ -500,6 +590,11 @@ static void stats_map_notify_fn(uint32_t event, char *key, void *old_value, void
                return ;
        }
 
+       /* Ignore schedmiss trackers as the values are read from the circular buffer */
+       if (strncmp(key, SCHEDMISS_PREFIX, strlen(SCHEDMISS_PREFIX)) == 0 ) {
+               return ;
+       }
+
        new_val.data = new_value;
        if (stats_map_get(key,
                          &new_value,
@@ -556,7 +651,7 @@ cs_error_t stats_map_track_add(const char *key_name,
                }
                /* Get initial value */
                if (stats_map_get(tracker->key_name,
-                                 &tracker->old_value, &value_len, &type) == CS_OK) {
+                                 &tracker->old_value, &value_len, &type) != CS_OK) {
                        tracker->old_value = 0ULL;
                }
        } else {
index 45891ae669c9f61c39353db42ced0ecda22e1c8e..eac9e7cdd7a075c92ab03af54d762563a2efe668 100644 (file)
@@ -69,3 +69,5 @@ void stats_trigger_trackers(void);
 void stats_ipcs_add_connection(int service_id, uint32_t pid, void *ptr);
 void stats_ipcs_del_connection(int service_id, uint32_t pid, void *ptr);
 cs_error_t cs_ipcs_get_conn_stats(int service_id, uint32_t pid, void *conn_ptr, struct ipcs_conn_stats *ipcs_stats);
+
+void stats_add_schedmiss_event(uint64_t, float delay);
index 6bc04fe1df1407b33897218629451a887e3ede04..da95c51d0ab0a206149a76f501b23ec25a20c054 100644 (file)
@@ -1,5 +1,5 @@
 .\"/*
-.\" * Copyright (c) 2012-2018 Red Hat, Inc.
+.\" * Copyright (c) 2012-2020 Red Hat, Inc.
 .\" *
 .\" * All rights reserved.
 .\" *
@@ -357,6 +357,27 @@ contains the total number of interrupted sends.
 .B service_id
 contains the ID of service which the IPC is connected to.
 
+
+.TP
+stats.schedmiss.<n>.*
+If corosync is not scheduled after the required period of time it will
+log this event and also write an entry to the stats cmap under this key.
+There can be up to 10 entries (0..9) in here, when an 11th event happens
+the earliest will be removed.
+
+These events will always be in reverse order, so stats.schedmiss.0.* will
+always be the latest event kept and 9 the oldest. If you want to listen
+for notifications then you are recommended to listen for changes
+to stats.schedmiss.0.timestamp or stats.schedmiss.0.delay.
+
+.B timestamp
+The time of the event in ms since the Epoch (ie time_t * 1000 but with
+valid milliseconds).
+
+.B delay
+The time that corosync was paused (in ms, float value).
+
+
 .TP
 stats.clear.*
 These are write-only keys used to clear the stats for various subsystems
@@ -370,6 +391,9 @@ Clears the knet stats
 .B ipc
 Clears the ipc stats
 
+.B schedmiss
+Clears the schedmiss stats
+
 .B all
 Clears all of the above stats
 
index a4b61bd5df4e36e15462ca4fc18f8bc198d1d0cb..ffca7e1b7138b22d2ef766a72b2e8e7078378db4 100644 (file)
@@ -115,7 +115,7 @@ static int print_help(void)
        printf("    about the networking and IPC traffic in some detail.\n");
        printf("\n");
        printf("Clear stats:\n");
-       printf("    corosync-cmapctl -C [knet|ipc|totem|all]\n");
+       printf("    corosync-cmapctl -C [knet|ipc|totem|schedmiss|all]\n");
        printf("    The 'stats' map is implied\n");
        printf("\n");
        printf("Load settings from a file:\n");
@@ -849,6 +849,7 @@ int main(int argc, char *argv[])
                        if (strcmp(optarg, "knet") == 0 ||
                            strcmp(optarg, "totem") == 0 ||
                            strcmp(optarg, "ipc") == 0 ||
+                           strcmp(optarg, "schedmiss") == 0 ||
                            strcmp(optarg, "all") == 0) {
                                action = ACTION_CLEARSTATS;
                                clear_opt = optarg;
@@ -857,7 +858,7 @@ int main(int argc, char *argv[])
                                map = CMAP_MAP_STATS;
                        }
                        else {
-                               fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc' or 'all'\n");
+                               fprintf(stderr, "argument to -C should be 'knet', 'totem', 'ipc', 'schedmiss' or 'all'\n");
                                return (EXIT_FAILURE);
                        }
                        break;