---
# Prometheus alerting rules
# Fund service platform
#
# Every rule below keeps its alert name, `for` duration and severity label;
# annotation text is emitted to users at runtime and is left as authored.

groups:
  # ==================== Service availability alerts ====================
  - name: service_availability
    rules:
      # Service down: the scrape target has reported up == 0 for 1 minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: '服务 {{ $labels.job }} 宕机'
          description: '服务 {{ $labels.instance }} 已经宕机超过 1 分钟'

      # Health check failed: the DOWN status series has been 1 for 30 seconds.
      - alert: HealthCheckFailed
        expr: spring_boot_health_status{status="DOWN"} == 1
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: '服务健康检查失败'
          description: '服务 {{ $labels.application }} 健康状态为 DOWN'

  # ==================== JVM memory alerts ====================
  - name: jvm_memory
    rules:
      # Heap usage high: used / max above 85% for 5 minutes.
      # NOTE(review): the ratio is evaluated per matching series; if the
      # exporter emits one series per memory pool, this alerts on individual
      # pools rather than on total heap - confirm that is intended.
      - alert: HeapMemoryUsageHigh
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'JVM 堆内存使用率过高'
          description: '服务 {{ $labels.application }} 堆内存使用率 {{ $value | printf "%.2f" }}%'

      # Heap nearly exhausted: same ratio as above, 95% for 1 minute.
      - alert: HeapMemoryCritical
        expr: (jvm_memory_used_bytes{area="heap"} / jvm_memory_max_bytes{area="heap"}) * 100 > 95
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'JVM 堆内存即将耗尽'
          description: '服务 {{ $labels.application }} 堆内存使用率 {{ $value | printf "%.2f" }}%,请立即处理'

      # GC time too high: per-second growth of the GC pause-seconds sum
      # stays above 0.5 for 5 minutes.
      # NOTE(review): evaluated per series; if the metric is split by extra
      # labels (e.g. GC action/cause), the combined GC share can exceed the
      # threshold without any single series doing so - confirm.
      - alert: GCTimeTooHigh
        expr: rate(jvm_gc_pause_seconds_sum[5m]) > 0.5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'GC 时间过长'
          description: '服务 {{ $labels.application }} GC 时间占比过高,可能影响性能'

  # ==================== HTTP request alerts ====================
  - name: http_requests
    rules:
      # High error rate: share of 5xx responses per application above 5%
      # for 5 minutes. The ratio is scaled to percent so that $value matches
      # the "%" suffix printed in the description (a 7% error rate renders
      # as "7.00%", not "0.07%"); the firing threshold itself is unchanged.
      - alert: HighErrorRate
        expr: >-
          (
          sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
          /
          sum(rate(http_server_requests_seconds_count[5m])) by (application)
          ) * 100 > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'HTTP 请求错误率过高'
          description: '服务 {{ $labels.application }} 5xx 错误率 {{ $value | printf "%.2f" }}%'

      # Slow responses: P95 latency per application above 2 seconds
      # for 5 minutes, computed from the request-duration histogram buckets.
      - alert: HighResponseTime
        expr: histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket[5m])) by (le, application)) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'HTTP 响应时间过长'
          description: '服务 {{ $labels.application }} P95 响应时间 {{ $value | printf "%.2f" }} 秒'

  # ==================== Database connection pool alerts ====================
  - name: database_connections
    rules:
      # HikariCP pool usage high: active / max above 80% for 5 minutes.
      - alert: HikariPoolUsageHigh
        expr: (hikaricp_connections_active / hikaricp_connections_max) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '数据库连接池使用率过高'
          description: '服务 {{ $labels.application }} 连接池使用率 {{ $value | printf "%.2f" }}%'

      # Pool waiters: at least one pending connection request for 2 minutes.
      - alert: HikariPoolPending
        expr: hikaricp_connections_pending > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: '数据库连接池存在等待'
          description: '服务 {{ $labels.application }} 有 {{ $value }} 个连接请求在等待'

  # ==================== System resource alerts ====================
  - name: system_resources
    rules:
      # CPU usage high: system_cpu_usage * 100 above 80 for 5 minutes.
      # NOTE(review): assumes system_cpu_usage is a 0-1 fraction - confirm.
      - alert: HighCPUUsage
        expr: system_cpu_usage * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'CPU 使用率过高'
          description: '服务 {{ $labels.application }} CPU 使用率 {{ $value | printf "%.2f" }}%'

      # Too many open file descriptors: open / max above 80% for 5 minutes.
      - alert: HighFileDescriptorUsage
        expr: process_files_open_files / process_files_max_files * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: '文件描述符使用率过高'
          description: '服务 {{ $labels.application }} 文件描述符使用率 {{ $value | printf "%.2f" }}%'