# Alerting rule groups in the vmalert / Prometheus rule-file layout
# (groups -> rules -> alert/expr/for).
# NOTE(review): presumably consumed as a variable that is rendered into a
# vmalert rules file -- confirm against the role/template that reads it.
vmalert_rules:
  groups:
    - name: default alert
      rules:
        # Ratio of non-"free" df types to the sum of all df types, matched per
        # (instance, df). Threshold is 0.75 for every df except "var-log",
        # which gets a looser 0.95 threshold. Must hold for 2h before firing.
        - alert: DiskUsage
          expr: >-
            sum(collectd_df_df_complex{type!="free"}) by (instance, df)
            / sum(collectd_df_df_complex{df!="var-log"}) by (instance, df)
            > .75
            or
            sum(collectd_df_df_complex{type!="free"}) by (instance, df)
            / sum(collectd_df_df_complex{df="var-log"}) by (instance, df)
            > .95
          for: 2h

        # A probe in the "websites" job has reported failure for 10m.
        - alert: TheWebsiteIsDown
          expr: >-
            probe_success{job="websites"} == 0
          for: 10m

        # A scrape target is down for 10m; instances matching "vmhost.*" are
        # excluded from this check.
        - alert: Missing Metrics
          expr: >-
            up{instance!~"vmhost.*"} == 0
          for: 10m

        # No collectd_nut_percent series exists at all. No `for:` is set, so
        # no hold period is configured for this rule.
        - alert: NUT is offline
          expr: >-
            absent(collectd_nut_percent)

    - name: Bitwarden
      rules:
        # Process count for "vaultwarden" has been below 1 for 5m.
        - alert: vaultwarden is not running
          expr: >-
            collectd_processes_ps_count_processes{processes="vaultwarden"} < 1
          for: 5m

    - name: Active Directory
      rules:
        # Any one of the listed processes has had a count below 1 for 5m
        # (the regex matches each process name as a separate series).
        - alert: samba is not running
          expr: >-
            collectd_processes_ps_count_processes{processes=~"samba|smbd|winbindd|krb5kdc"} < 1
          for: 5m

    - name: Graylog
      rules:
        # More than 100 uncommitted journal entries sustained for 1h.
        - alert: unprocessed messages
          expr: >-
            org_graylog2_journal_entries_uncommitted > 100
          for: 1h

    - name: mdraid
      rules:
        # Neither mdraid rule sets `for:`, so no hold period is configured.
        - alert: mdraid missing disk
          expr: >-
            collectd_md_md_disks{type="missing"} != 0

        - alert: mdraid failed disk
          expr: >-
            collectd_md_md_disks{type="failed"} != 0