Alerts

/etc/prometheus/rules/ansible_managed.rules > ansible managed alert rules
Watchdog (1 active)
# Watchdog: heartbeat rule for the alerting pipeline. The expression
# `vector(1)` always yields a sample, so the alert stays active once the
# 10m hold period has passed.
alert: Watchdog
expr: 'vector(1)'
for: 10m
labels:
  severity: warning
annotations:
  summary: Ensure entire alerting pipeline is functional
  description: >-
    This is an alert meant to ensure that the entire alerting pipeline is
    functional. This alert is always firing, therefore it should always be
    firing in Alertmanager and always fire against a receiver. There are
    integrations with various notification mechanisms that send a
    notification when this alert is not firing. For example the
    "DeadMansSnitch" integration in PagerDuty.
Labels | State | Active Since | Value
alertname="Watchdog" severity="warning" | firing | 2019-05-27 18:48:44.318691705 +0000 UTC | 1
ClockSkewDetected (0 active)
# ClockSkewDetected: warns when the absolute value of
# node_timex_offset_seconds, multiplied by 1000, stays above 30 for 2 minutes.
alert: ClockSkewDetected
expr: abs(node_timex_offset_seconds) * 1000 > 30
for: 2m
labels:
  severity: warning
annotations:
  summary: 'Instance {{ $labels.instance }} - Clock skew detected'
  description: >-
    Clock skew detected on {{ $labels.instance }}.
    Ensure NTP is configured correctly on this host.
CriticalCPULoad (0 active)
# CriticalCPULoad: fires when 100 minus the average idle-mode CPU percentage
# of an instance stays above 96 for 2 minutes.
alert: CriticalCPULoad
# Two deliberate choices in the expression:
#  * `job` is part of the aggregation so that the alert keeps the label and
#    {{ $labels.job }} in the description renders a value; aggregating by
#    instance alone drops it. The selector already pins job="node", so the
#    set of resulting series is unchanged.
#  * rate() is used rather than irate(): irate() only looks at the last two
#    samples, and a brief dip would reset the `for` hold period.
expr: >-
  100 - (avg by(instance, job)
  (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100) > 96
for: 2m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load
    for more than 2 minutes.'
  summary: Instance {{ $labels.instance }} - Critical CPU load
CriticalDiskSpace (0 active)
# CriticalDiskSpace: fires when the ratio of free bytes to filesystem size
# stays below 0.1 for 4 minutes. The numerator's selector excludes squashfs
# and fuse.* filesystem types and mountpoints at or under /run.
alert: CriticalDiskSpace
expr: >-
  node_filesystem_free_bytes{fstype!~"(squashfs|fuse.*)",job="node",mountpoint!~"^/run(/.*|$)"}
  / node_filesystem_size_bytes{job="node"} < 0.1
for: 4m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} - Critical disk space usage'
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has less than
    10% space remaining.
CriticalRAMUsage (0 active)
# CriticalRAMUsage: fires when the share of MemTotal that is not accounted
# for by MemFree + Buffers + Cached stays above 98 percent for 5 minutes.
alert: CriticalRAMUsage
expr: >-
  (1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes +
  node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100 > 98
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} has Critical Memory Usage'
  description: >-
    {{ $labels.instance }} has Critical Memory Usage more than 5 minutes.
InstanceDown (0 active)
# InstanceDown: fires when a target's `up` series has been equal to 0 for
# 5 minutes.
alert: InstanceDown
expr: 'up == 0'
for: 5m
labels:
  severity: critical
annotations:
  summary: 'Instance {{ $labels.instance }} down'
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down
    for more than 5 minutes.
RebootRequired (0 active)
# RebootRequired: warns whenever node_reboot_required is greater than 0.
# There is no `for` clause on this rule.
alert: RebootRequired
expr: node_reboot_required > 0
labels:
  severity: warning
annotations:
  summary: 'Instance {{ $labels.instance }} - reboot required'
  description: "{{ $labels.instance }} requires a reboot."