diff --git a/app/services/dispatcher/messages_in_flight.js b/app/services/dispatcher/messages_in_flight.js index 98a7de526926da5940d8652ae755bec14625c9ee..a0ec1ec0c3086817d3e5668c735dce01ea23ad85 100644 --- a/app/services/dispatcher/messages_in_flight.js +++ b/app/services/dispatcher/messages_in_flight.js @@ -2,13 +2,59 @@ const _ = require('lodash') const metrics = require('./metrics') const inFlight = metrics.channelMessagesInFlight +const durations = metrics.channelMessageDuration -function launch(channel, recipient) { - inFlight.inc({ channel }) +/* + A hash table of timers, one per recipient message sent. + + Each key is channel+recipient, and each value is an array of stop-timer lambdas + provided by prom-client to record the duration of the timer. + + Note that this could consume considerable memory + if many thousands of messages (message = channel+recipient) + are in flight at once. + + A scalable alternative would be to store each channel+recipient's launch timestamp + in a data store, and retrieve it to record the duration at land time. + + Using a data store for queued messages would also provide durability if used instead + of DBUS as signald's queue, so worth exploring overall. +*/ +const timers = {} + +function startTimer(channel, recipient) { + const key = channel+recipient + timers[key] = [...timers[key] || [], + durations.startTimer({ channel, recipient })] +} + +function stopTimer(channel, recipient) { + const key = channel+recipient + + // since these come from outside, let's not assume this can't happen + if (! timers[key]) + console.error("Received message receipt for a message without a timer!") + else { + + const stop = timers[key][0] + stop() + + const remaining = timers[key].slice(1) + if (! remaining) + delete timers[key] + else + timers[key] = remaining + } +} + +function launch(channel, recipient) { + inFlight.inc({ channel }) + startTimer(channel, recipient) } -function land(channel, recipient) { +function land(channel, recipient) { inFlight.dec({ channel }) + stopTimer(channel, recipient) } module.exports = { diff --git a/app/services/dispatcher/metrics.js b/app/services/dispatcher/metrics.js index 1780ad29c050713a878717610b98ee642df2f849..b778996a5fe800c30e57806d06cf6c49a7bada5c 100644 --- a/app/services/dispatcher/metrics.js +++ b/app/services/dispatcher/metrics.js @@ -18,12 +18,20 @@ const channelMessagesInFlight = new prometheus.Gauge(register({ labelNames: ['channel'] })) +const channelMessageDuration = new prometheus.Gauge(register({ + name: 'channel_message_duration', + help: 'An estimate of the duration of each channel+recipient message broadcast ' + + 'from sending it to signald to receiving isReceipt: true from signald.', + labelNames: ['channel', 'recipient'] +})) + module.exports = { registry, collectDefaults, signaldMessages, channelMessages, - channelMessagesInFlight + channelMessagesInFlight, + channelMessageDuration } diff --git a/app/services/registrar/run.js b/app/services/registrar/run.js index 2dbd0ebddbb560113466d972d29bc54f9457a44f..04c9c337d9aa7e23fc2c6ca81d66422abb167920 100644 --- a/app/services/registrar/run.js +++ b/app/services/registrar/run.js @@ -12,7 +12,7 @@ const { const run = async (db, sock) => { logger.log('--- Initializing Registrar...') -// metrics.collectDefaults() + metrics.collectDefaults() logger.log('----- Starting collection of default prometheus metrics for Registrar service.') logger.log(`----- Starting Registrar api server...`) diff --git a/conf/grafana/provisioning/dashboards/Signalboost node app-1591679176625.json b/conf/grafana/provisioning/dashboards/Signalboost node app-1591679176625.json index 88fdf96faf6d0dca2d84ac5749f8e134b8e596c9..410d8897d2b06819440aba5a60d727c7d2345bf4 100644 --- a/conf/grafana/provisioning/dashboards/Signalboost node app-1591679176625.json +++ b/conf/grafana/provisioning/dashboards/Signalboost node app-1591679176625.json @@ -23,7 +23,6 @@ "dashLength": 10, "dashes": false, "datasource": null, - "description": "The number of messages per channel that signalboost has sent but not yet received receipt of from signald. Average in flight messages per second over the last hour.", "fieldConfig": { "defaults": { "custom": {} @@ -39,7 +38,7 @@ "y": 0 }, "hiddenSeries": false, - "id": 20, + "id": 22, "legend": { "avg": false, "current": false, @@ -65,7 +64,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(channel_messages_in_flight[1h])", + "expr": "avg(channel_message_duration)", "interval": "", "legendFormat": "", "refId": "A" @@ -75,7 +74,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "channel_messages_in_flight: avg / sec over last 1h", + "title": "channel_message_duration: avg ", "tooltip": { "shared": true, "sort": 0, @@ -117,8 +116,8 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": null, - "description": "Purple is process heap in bytes. Lower yellow line is nodejs heap used.", + "datasource": "Prometheus", + "description": "Time at 1.00 is time app is up.", "fieldConfig": { "defaults": { "custom": {} @@ -128,13 +127,13 @@ "fill": 1, "fillGradient": 0, "gridPos": { - "h": 8, + "h": 9, "w": 12, "x": 12, "y": 0 }, "hiddenSeries": false, - "id": 8, + "id": 2, "legend": { "avg": false, "current": false, @@ -160,29 +159,17 @@ "steppedLine": false, "targets": [ { - "expr": "nodejs_heap_size_used_bytes", + "expr": "up", "interval": "", "legendFormat": "", "refId": "A" - }, - { - "expr": "nodejs_heap_space_size_available_bytes", - "interval": "", - "legendFormat": "", - "refId": "B" - }, - { - "expr": "process_heap_bytes", - "interval": "", - "legendFormat": "", - "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Heap available / used", + "title": "Uptime", "tooltip": { "shared": true, "sort": 0, @@ -225,6 +212,7 @@ "dashLength": 10, "dashes": false, "datasource": null, + "description": "The number of messages per channel that signalboost has sent but not yet received receipt of from signald. Average in flight messages per second over the last hour.", "fieldConfig": { "defaults": { "custom": {} @@ -240,7 +228,7 @@ "y": 8 }, "hiddenSeries": false, - "id": 18, + "id": 20, "legend": { "avg": false, "current": false, @@ -266,7 +254,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(signald_message_count[1h]) ", + "expr": "rate(channel_messages_in_flight[1h])", "interval": "", "legendFormat": "", "refId": "A" @@ -276,7 +264,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "signald_message_count (avg / sec over last 1h)", + "title": "channel_messages_in_flight: avg / sec over last 1h", "tooltip": { "shared": true, "sort": 0, @@ -319,6 +307,7 @@ "dashLength": 10, "dashes": false, "datasource": null, + "description": "Purple is process heap in bytes. Lower yellow line is nodejs heap used.", "fieldConfig": { "defaults": { "custom": {} @@ -331,10 +320,10 @@ "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 9 }, "hiddenSeries": false, - "id": 6, + "id": 8, "legend": { "avg": false, "current": false, @@ -360,23 +349,29 @@ "steppedLine": false, "targets": [ { - "expr": "process_open_fds", + "expr": "nodejs_heap_size_used_bytes", "interval": "", "legendFormat": "", "refId": "A" }, { - "expr": "process_max_fds", + "expr": "nodejs_heap_space_size_available_bytes", "interval": "", "legendFormat": "", "refId": "B" + }, + { + "expr": "process_heap_bytes", + "interval": "", + "legendFormat": "", + "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "File descriptors: max / open", + "title": "Heap available / used", "tooltip": { "shared": true, "sort": 0, @@ -434,7 +429,7 @@ "y": 16 }, "hiddenSeries": false, - "id": 16, + "id": 18, "legend": { "avg": false, "current": false, @@ -460,7 +455,7 @@ "steppedLine": false, "targets": [ { - "expr": "rate(channel_message_count[1h])", + "expr": "rate(signald_message_count[1h]) ", "interval": "", "legendFormat": "", "refId": "A" @@ -470,7 +465,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "channel_message_count (avg / sec over last 1h)", + "title": "signald_message_count (avg / sec over last 1h)", "tooltip": { "shared": true, "sort": 0, @@ -525,10 +520,10 @@ "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 17 }, "hiddenSeries": false, - "id": 10, + "id": 6, "legend": { "avg": false, "current": false, @@ -554,17 +549,23 @@ "steppedLine": false, "targets": [ { - "expr": "nodejs_active_requests_total", + "expr": "process_open_fds", "interval": "", "legendFormat": "", "refId": "A" + }, + { + "expr": "process_max_fds", + "interval": "", + "legendFormat": "", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Process active requests", + "title": "File descriptors: max / open", "tooltip": { "shared": true, "sort": 0, @@ -606,8 +607,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "Prometheus", - "description": "Time at 1.00 is time app is up.", + "datasource": null, "fieldConfig": { "defaults": { "custom": {} @@ -617,13 +617,13 @@ "fill": 1, "fillGradient": 0, "gridPos": { - "h": 9, + "h": 8, "w": 12, "x": 0, "y": 24 }, "hiddenSeries": false, - "id": 2, + "id": 16, "legend": { "avg": false, "current": false, @@ -649,7 +649,7 @@ "steppedLine": false, "targets": [ { - "expr": "up", + "expr": "rate(channel_message_count[1h])", "interval": "", "legendFormat": "", "refId": "A" @@ -659,7 +659,101 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Uptime", + "title": "channel_message_count (avg / sec over last 1h)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 25 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "nodejs_active_requests_total", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Process active requests", "tooltip": { "shared": true, "sort": 0, @@ -714,7 +808,7 @@ "h": 8, "w": 12, "x": 0, - "y": 33 + "y": 32 }, "hiddenSeries": false, "id": 14, @@ -808,7 +902,7 @@ "h": 8, "w": 12, "x": 0, - "y": 41 + "y": 40 }, "hiddenSeries": false, "id": 12, @@ -912,5 +1006,5 @@ "timezone": "", "title": "Signalboost node app", "uid": "fV4wThmMk", - "version": 7 + "version": 8 }