Alerts

CriticalCPULoad (0 active)
# CriticalCPULoad: fires when 100 minus the average idle-CPU percentage of an
# instance stays above 96 for 2 minutes.
alert: CriticalCPULoad
# rate() is used instead of irate(): irate() only considers the last two
# samples in the window, so a single short dip can reset the `for` timer.
# `job` is kept in the aggregation so that the job label referenced by the
# description annotation is still present on the resulting series.
expr: >-
  100
  - (avg by (instance, job) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))
  * 100) > 96
for: 2m
labels:
  severity: critical
annotations:
  # Folded block scalar: the two lines below are joined with a single space.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load
    for more than 2 minutes.
  summary: Instance {{ $labels.instance }} - Critical CPU load
CriticalDiskSpace (0 active)
# CriticalDiskSpace: fires when free bytes divided by total bytes of a
# filesystem stays below 0.1 for 4 minutes.
alert: CriticalDiskSpace
# The left-hand selector excludes fstype values matching (squashfs|fuse.*)
# and mountpoints matching ^/run(/.*|$). The folded scalar joins the lines
# below with single spaces into one expression.
expr: >-
  node_filesystem_free_bytes{fstype!~"(squashfs|fuse.*)",job="node",mountpoint!~"^/run(/.*|$)"}
  / node_filesystem_size_bytes{job="node"}
  < 0.1
for: 4m
labels:
  severity: critical
annotations:
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has less than 10%
    space remaining.
  summary: 'Instance {{ $labels.instance }} - Critical disk space usage'
CriticalRAMUsage (0 active)
# CriticalRAMUsage: fires when the share of memory that is not free, buffers
# or cache stays above 98 percent for 5 minutes.
alert: CriticalRAMUsage
# (1 - (MemFree + Buffers + Cached) / MemTotal) * 100, compared against 98.
# The folded scalar joins the two lines below with a single space.
expr: >-
  (1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
  / node_memory_MemTotal_bytes)) * 100 > 98
for: 5m
labels:
  severity: critical
annotations:
  # NOTE(review): the wording lacks "for" before "more than"; left unchanged
  # here because the text is emitted at runtime.
  description: >-
    {{ $labels.instance }} has Critical Memory Usage more than 5 minutes.
  summary: 'Instance {{ $labels.instance }} has Critical Memory Usage'
InstanceDown (0 active)
# InstanceDown: fires when the `up` series of a target equals 0 for 5 minutes.
alert: InstanceDown
expr: 'up == 0'
for: 5m
labels:
  severity: critical
annotations:
  # Folded block scalar: the two lines below are joined with a single space.
  description: >-
    {{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 5 minutes.
  summary: 'Instance {{ $labels.instance }} down'
RebootRequired (0 active)
# RebootRequired: warning raised when node_reboot_required is greater than 0.
# This rule sets no `for:` duration.
alert: RebootRequired
expr: 'node_reboot_required > 0'
labels:
  severity: warning
annotations:
  description: >-
    {{ $labels.instance }} requires a reboot.
  summary: 'Instance {{ $labels.instance }} - reboot required'