Commit 83b8f2d5 authored by aguestuser's avatar aguestuser

Merge branch '340-only-alert-after-2-consecutive-health-check-failures' into 'main'

Resolve "only alert after 2 consecuitve healthcheck failures"

Closes #340

See merge request !391
parents 2e58c084 eab62d8f
const defaults = {
healthcheckInterval: 1000 * 60 * 15, // 15 min
healthcheckInterval: 1000 * 60 * 10, // 10 min
hotlineMessageExpiryInMillis: 1000 * 60 * 60 * 24 * 28, // 4 weeks
inviteDeletionInterval: 1000 * 60 * 60, // 1 hour
inviteExpiryInMillis: 1000 * 60 * 60 * 24 * 14, // 2 weeks
......
......@@ -4,8 +4,8 @@ const defaults = {
broadcastSpacing: 100, // 100 millis
defaultMessageExpiryTime: 60 * 60 * 24 * 7, // 1 week
expiryUpdateDelay: 200, // 200 millis
healthcheckTimeout: 1000 * 60 * 15, // 15 min
healtcheckSpacing: 100, // 100 millis
healthcheckTimeout: 1000 * 60 * 10, // 10 min
healthcheckSpacing: 50, // 50 millis
intervalBetweenRegistrationBatches: 120000, // 2 minutes
intervalBetweenRegistrations: 2000, // 2 seconds
keystorePath: '/var/lib/signald/data', // given by docker-compose file(s)
......@@ -27,7 +27,8 @@ const test = {
broadcastBatchInterval: 10, // 10 millis
broadcastBatchSize: 1,
expiryUpdateDelay: 1, // millis
healthcheckTimeout: 30, // millis
healthcheckTimeout: 20, // millis
healthcheckSpacing: 1, // millis
intervalBetweenRegistrationBatches: 30, // millis
intervalBetweenRegistrations: 5, // millis,
maxResendInterval: 256, // ~ 2.5 sec,
......@@ -44,7 +45,7 @@ const test = {
const development = {
...defaults,
healthcheckTimeout: 1000 * 60, // 60 sec
healthcheckTimeout: 1000 * 30, // 30 sec
}
module.exports = {
......
......@@ -3,14 +3,18 @@ const util = require('./util')
const signal = require('./signal')
const { messageTypes } = require('./signal/constants')
const metrics = require('./metrics')
const notifier = require('./notifier')
const { zip } = require('lodash')
const { sdMessageOf } = require('./signal/constants')
const {
signal: { diagnosticsPhoneNumber, healthcheckSpacing },
signal: { diagnosticsPhoneNumber, healthcheckSpacing, healthcheckTimeout },
} = require('./config')
const logger = util.loggerOf('diagnostics')
// Set<string>
const failedHealthchecks = new Set()
// () => Promise<void>
const sendHealthchecks = async () => {
try {
......@@ -21,24 +25,30 @@ const sendHealthchecks = async () => {
channelPhoneNumbers.map(phoneNumber => () => signal.healthcheck(phoneNumber)),
healthcheckSpacing,
)
zip(channelPhoneNumbers, responseTimes).forEach(([channelPhoneNumber, responseTime]) => {
metrics.setGauge(metrics.gauges.CHANNEL_HEALTH, responseTime, [channelPhoneNumber])
if (responseTime === -1) _sendTimeoutAlerts(channelPhoneNumber)
})
await Promise.all(
zip(channelPhoneNumbers, responseTimes).map(([channelPhoneNumber, responseTime]) => {
metrics.setGauge(metrics.gauges.CHANNEL_HEALTH, responseTime, [channelPhoneNumber])
if (responseTime === -1)
return _handleFailedHealtcheck(channelPhoneNumber, channelPhoneNumbers.length)
}),
)
} catch (e) {
logger.error(e)
}
}
const _sendTimeoutAlerts = async channelPhoneNumber => {
const diagnosticChannel = await channelRepository.findDeep(diagnosticsPhoneNumber)
return signal.broadcastMessage(
channelRepository.getAdminPhoneNumbers(diagnosticChannel),
sdMessageOf(
{ phoneNumber: diagnosticsPhoneNumber },
// string -> Promise<void>
const _handleFailedHealtcheck = async (channelPhoneNumber, numHealtchecks) => {
// alert maintainers if channel has failed 2 consecutive healthchecks
if (failedHealthchecks.has(channelPhoneNumber))
await notifier.notifyMaintainers(
`Channel ${channelPhoneNumber} failed to respond to healthcheck`,
),
)
)
// otherwise cache the failure for another round of health checks so we can alert if it fails again
failedHealthchecks.add(channelPhoneNumber)
util
.wait(2 * numHealtchecks * (healthcheckTimeout + healthcheckSpacing))
.then(() => failedHealthchecks.delete(channelPhoneNumber))
}
// (string, string) => Promise<string>
......@@ -54,4 +64,5 @@ const respondToHealthcheck = (channelPhoneNumber, healthcheckId) =>
module.exports = {
respondToHealthcheck,
sendHealthchecks,
failedHealthchecks,
}
......@@ -4,9 +4,8 @@ volumes:
postgres_data:
logs:
# NOTE:
# you can uncomment the below if you'd like to run unit tests behind a VPN.
# do not leave it uncommented as that will mess up CI!
# the below directive makes it possible to run tests behind a VPN
# (feel free to comment it out if it breaks anything for you!)
networks:
default:
......
version: '3'
# NOTE:
# you can uncomment the below if you'd like to run unit tests behind a VPN.
# do not leave it uncommented as that will mess up CI!
# NOTE: the below directive makes it possible to run tests behind a VPN
# (feel free to comment it out if it breaks anythuing for you!)
networks:
default:
......
import { expect } from 'chai'
import { describe, it, before, beforeEach, after, afterEach } from 'mocha'
import { after, afterEach, before, beforeEach, describe, it } from 'mocha'
import sinon from 'sinon'
import app from '../../app'
import testApp from '../support/testApp'
import db from '../../app/db'
import { times } from 'lodash'
import { flatten, map, times } from 'lodash'
import socket from '../../app/socket/write'
import util from '../../app/util'
import signal from '../../app/signal'
import { channelFactory } from '../support/factories/channel'
import { sendHealthchecks } from '../../app/diagnostics'
import { channelFactory, deepChannelFactory } from '../support/factories/channel'
import { failedHealthchecks, sendHealthchecks } from '../../app/diagnostics'
import { messageTypes } from '../../app/signal/constants'
const {
signal: { diagnosticsPhoneNumber },
signal: { diagnosticsPhoneNumber, healthcheckTimeout },
} = require('../../app/config')
describe('diagnostics jobs', () => {
const uuids = times(3, util.genUuid)
let channels, writeStub, readSock
let channels, diagnosticsChannel, writeStub, readSock
const createChannels = async () => {
channels = await Promise.all(times(3, () => app.db.channel.create(channelFactory())))
diagnosticsChannel = await app.db.channel.create(
deepChannelFactory({ phoneNumber: diagnosticsPhoneNumber }),
{
include: [{ model: app.db.membership }],
},
)
}
const destroyAllChannels = async () => {
......@@ -34,7 +41,7 @@ describe('diagnostics jobs', () => {
await app.db.channel.destroy({ where: {}, force: true })
}
const writeToReadSock = outSdMessage => {
const echoBackToSocket = outSdMessage => {
const inSdMessage = {
type: messageTypes.MESSAGE,
data: {
......@@ -57,11 +64,8 @@ describe('diagnostics jobs', () => {
await destroyAllChannels()
await createChannels()
readSock = await app.socketPool.acquire()
writeStub = sinon.stub(socket, 'write').callsFake(writeToReadSock)
const genUuidStub = sinon.stub(util, 'genUuid')
uuids.forEach((uuid, idx) => genUuidStub.onCall(idx).returns(uuid))
//
await sendHealthchecks()
})
afterEach(async () => {
await destroyAllChannels()
......@@ -69,22 +73,111 @@ describe('diagnostics jobs', () => {
sinon.restore()
})
it('sends a a healthcheck to every channel and gets a response', () => {
const messages = times(2 * channels.length, n => writeStub.getCall(n).args[0])
expect(messages).to.have.deep.members([
...channels.map((channel, idx) => ({
messageBody: `healthcheck ${uuids[idx]}`,
recipientAddress: { number: channels[idx].phoneNumber },
type: messageTypes.SEND,
username: diagnosticsPhoneNumber,
})),
...channels.map((channel, idx) => ({
messageBody: `healthcheck_response ${uuids[idx]}`,
recipientAddress: { number: diagnosticsPhoneNumber },
type: messageTypes.SEND,
username: channels[idx].phoneNumber,
})),
])
describe('in a healthy state', () => {
beforeEach(async () => {
writeStub = sinon.stub(socket, 'write').callsFake(echoBackToSocket)
await sendHealthchecks()
})
it('sends a a healthcheck to every channel and gets a response', () => {
expect(writeStub.callCount).to.eql(2 * channels.length)
expect(flatten(map(writeStub.getCalls(), 'args'))).to.have.deep.members([
...channels.map((channel, idx) => ({
messageBody: `healthcheck ${uuids[idx]}`,
recipientAddress: { number: channels[idx].phoneNumber },
type: messageTypes.SEND,
username: diagnosticsPhoneNumber,
})),
...channels.map((channel, idx) => ({
messageBody: `healthcheck_response ${uuids[idx]}`,
recipientAddress: { number: diagnosticsPhoneNumber },
type: messageTypes.SEND,
username: channels[idx].phoneNumber,
})),
])
})
})
describe('when a healtcheck fails for the first time', () => {
beforeEach(async () => {
writeStub = sinon.stub(socket, 'write').returns(Promise.resolve(''))
await sendHealthchecks()
await util.wait(healthcheckTimeout)
})
it('does not alert maintainers', () => {
expect(writeStub.callCount).to.eql(channels.length)
})
it('caches healthcheck failures', () => {
expect(failedHealthchecks.size).to.eql(channels.length)
})
})
describe('when a healtcheck fails twice in a row', () => {
beforeEach(async function() {
this.timeout(8000)
writeStub = sinon.stub(socket, 'write').returns(Promise.resolve(''))
await sendHealthchecks()
await util.wait(healthcheckTimeout)
await sendHealthchecks()
await util.wait(2 * healthcheckTimeout)
})
it('notifies maintainers', async function() {
const numHealtchecks = 2 * channels.length
const numAlerts = 2 * channels.length // 2 alerts per channel, since diagnostics has 2 admins
expect(writeStub.callCount).to.eql(numHealtchecks + numAlerts)
expect(flatten(map(writeStub.getCalls().slice(-numAlerts), 'args'))).to.have.deep.members([
{
messageBody: `Channel ${channels[0].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[0].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
{
messageBody: `Channel ${channels[0].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[1].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
{
messageBody: `Channel ${channels[1].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[0].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
{
messageBody: `Channel ${channels[1].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[1].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
{
messageBody: `Channel ${channels[2].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[0].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
{
messageBody: `Channel ${channels[2].phoneNumber} failed to respond to healthcheck`,
recipientAddress: {
number: diagnosticsChannel.memberships[1].memberPhoneNumber,
},
type: 'send',
username: diagnosticsPhoneNumber,
},
])
})
})
})
})
......@@ -43,6 +43,7 @@ const socketPoolResource = async () => {
const metricsResource = () => {
const counterStub = { labels: () => ({ inc: () => null }) }
const histogramStub = { labels: () => ({ observe: () => null }) }
const gaugueStub = { labels: () => ({ set: () => null }) }
return {
run: () => ({
register: {
......@@ -57,6 +58,9 @@ const metricsResource = () => {
histograms: {
MESSAGE_ROUNDTRIP: histogramStub,
},
gauges: {
CHANNEL_HEALTH: gaugueStub,
},
}),
}
}
......
......@@ -4,8 +4,9 @@ import sinon from 'sinon'
import channelRepository from '../../app/db/repositories/channel'
import metrics, { gauges } from '../../app/metrics'
import signal from '../../app/signal'
import { times, zip } from 'lodash'
import { respondToHealthcheck, sendHealthchecks } from '../../app/diagnostics'
import notifier from '../../app/notifier'
import { times, zip, flatten, map } from 'lodash'
import { respondToHealthcheck, sendHealthchecks, failedHealthchecks } from '../../app/diagnostics'
import { channelFactory, deepChannelFactory } from '../support/factories/channel'
import { sdMessageOf } from '../../app/signal/constants'
const {
......@@ -16,23 +17,26 @@ describe('diagnostics module', () => {
const channels = times(3, channelFactory)
const channelPhoneNumbers = channels.map(ch => ch.phoneNumber)
const diagnosticsChannel = deepChannelFactory({ phoneNumber: diagnosticsPhoneNumber })
const sysadminPhoneNumbers = channelRepository.getAdminPhoneNumbers(diagnosticsChannel)
const responseTimes = [-1, 1, 2]
let setGaugeStub, sendMessageStub, broadcastMessageStub, healthcheckStub
const responseTimes = [-1, -1, 2]
let setGaugeStub, sendMessageStub, notifyMaintainersStub, healthcheckStub
beforeEach(() => {
failedHealthchecks.add(channels[0].phoneNumber)
sinon
.stub(channelRepository, 'findAll')
.returns(Promise.resolve([...channels, diagnosticsChannel]))
sinon.stub(channelRepository, 'findDeep').returns(Promise.resolve(diagnosticsChannel))
setGaugeStub = sinon.stub(metrics, 'setGauge').returns(Promise.resolve())
sendMessageStub = sinon.stub(signal, 'sendMessage').returns(Promise.resolve(42))
broadcastMessageStub = sinon
.stub(signal, 'broadcastMessage')
.returns(Promise.resolve([1, 2, 3]))
notifyMaintainersStub = sinon
.stub(notifier, 'notifyMaintainers')
.returns(Promise.resolve(['1', '2', '3']))
healthcheckStub = sinon.stub(signal, 'healthcheck')
})
afterEach(() => sinon.restore())
afterEach(() => {
failedHealthchecks.clear()
sinon.restore()
})
describe('sending a healthcheck', () => {
beforeEach(async () => {
......@@ -58,14 +62,14 @@ describe('diagnostics module', () => {
})
})
it("sends an alert to admins if any channels don't respond", () => {
expect(broadcastMessageStub.callCount).to.eql(1)
expect(broadcastMessageStub.getCall(0).args).to.eql([
sysadminPhoneNumbers,
sdMessageOf(
{ phoneNumber: diagnosticsPhoneNumber },
`Channel ${channelPhoneNumbers[0]} failed to respond to healthcheck`,
),
it('caches when channel fails first healtcheck', () => {
expect(failedHealthchecks).to.contain(channels[1].phoneNumber)
})
it('alerts maintainers when a channel fails its second consecutive healthcheck', () => {
// note: channels[1] also failed healtcheck, but we don't alert about it
expect(flatten(map(notifyMaintainersStub.getCalls(), 'args'))).to.eql([
`Channel ${channelPhoneNumbers[0]} failed to respond to healthcheck`,
])
})
})
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment