zhangjf 5b80e237b9 feat: Docker容器化部署和Prometheus+Grafana监控
## Docker 容器化部署

### 新增文件
- Dockerfile: 多阶段构建镜像,支持 Java 21
- docker-compose.yml: 完整服务编排配置
  - 基础设施: MySQL 8.0, Redis 7, Nacos 3.0
  - 监控: Prometheus, Grafana
  - 业务服务: Gateway + 9个微服务
- docker/.env: 环境变量配置
- docker/mysql/init/01-init.sql: 数据库初始化脚本

### Docker 特性
- 多阶段构建优化镜像大小
- 非 root 用户运行服务
- 健康检查配置
- 统一时区设置 (Asia/Shanghai)

## Prometheus + Grafana 监控

### Prometheus 配置
- docker/prometheus/prometheus.yml: 服务发现配置
- docker/prometheus/rules/alerts.yml: 告警规则
  - 服务可用性告警
  - JVM 内存告警
  - HTTP 请求告警
  - 数据库连接池告警
  - 系统资源告警

### Grafana 配置
- docker/grafana/provisioning/: 数据源和Dashboard自动导入
- docker/grafana/dashboards/fund-platform-dashboard.json
  - 服务概览面板
  - JVM 内存监控
  - 数据库连接池监控

### Spring Boot Actuator 集成
- pom.xml: 添加 spring-boot-starter-actuator 和 micrometer-registry-prometheus
- application-docker.yml: Prometheus 端点配置

## 服务端口规划
- Gateway: 8000
- fund-sys: 8100
- fund-cust: 8200
- fund-proj: 8300
- fund-req: 8400
- fund-exp: 8500
- fund-receipt: 8600
- fund-report: 8700
- fund-file: 8800
- Prometheus: 9090
- Grafana: 3000
- Nacos: 8848
2026-02-19 18:48:15 +08:00

129 lines
4.6 KiB
YAML

# Prometheus alerting rules
# Fund service platform
groups:
# ==================== Service availability alerts ====================
- name: service_availability
  rules:
  # Critical: a scrape target has reported up == 0 for a full minute.
  - alert: ServiceDown
    expr: up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: '服务 {{ $labels.job }} 宕机'
      description: '服务 {{ $labels.instance }} 已经宕机超过 1 分钟'
  # Warning: a health series tagged status="DOWN" has held the value 1
  # for 30 seconds.
  # NOTE(review): spring_boot_health_status is not defined anywhere in this
  # file - confirm the services actually export it, otherwise this rule can
  # never fire.
  - alert: HealthCheckFailed
    expr: spring_boot_health_status{status="DOWN"} == 1
    for: 30s
    labels:
      severity: warning
    annotations:
      summary: '服务健康检查失败'
      description: '服务 {{ $labels.application }} 健康状态为 DOWN'
# ==================== JVM memory alerts ====================
- name: jvm_memory
  rules:
  # Warning: whole-heap usage above 85% for 5 minutes.
  # Used and max bytes are summed across memory pools (the `id` label) so
  # the ratio describes the heap as a whole instead of firing once per
  # pool. Pools that report a non-positive max (no limit defined) are
  # dropped from the denominator so they cannot skew the ratio.
  - alert: HeapMemoryUsageHigh
    expr: >-
      sum without (id) (jvm_memory_used_bytes{area="heap"})
      / sum without (id) (jvm_memory_max_bytes{area="heap"} > 0)
      * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'JVM 堆内存使用率过高'
      description: '服务 {{ $labels.application }} 堆内存使用率 {{ $value | printf "%.2f" }}%'
  # Critical: whole-heap usage above 95% for 1 minute.
  # Same pool aggregation as HeapMemoryUsageHigh, with a tighter threshold
  # and a shorter hold time.
  - alert: HeapMemoryCritical
    expr: >-
      sum without (id) (jvm_memory_used_bytes{area="heap"})
      / sum without (id) (jvm_memory_max_bytes{area="heap"} > 0)
      * 100 > 95
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: 'JVM 堆内存即将耗尽'
      description: '服务 {{ $labels.application }} 堆内存使用率 {{ $value | printf "%.2f" }}%,请立即处理'
  # Warning: the 5-minute rate of accumulated GC pause seconds has stayed
  # above 0.5 (more than half a second of pause per second) for 5 minutes.
  # NOTE(review): evaluated per exported series; if pauses are split across
  # several label combinations, consider summing them before comparing.
  - alert: GCTimeTooHigh
    expr: rate(jvm_gc_pause_seconds_sum[5m]) > 0.5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'GC 时间过长'
      description: '服务 {{ $labels.application }} GC 时间占比过高,可能影响性能'
# ==================== HTTP request alerts ====================
- name: http_requests
  rules:
  # Warning: share of 5xx responses per application above 5% for 5 minutes.
  # The ratio is scaled by 100 so that $value is a percentage, which is what
  # the description renders (it appends a literal '%'). Without the scaling
  # a 7% error rate would be reported as "0.07%".
  - alert: HighErrorRate
    expr: >-
      sum(rate(http_server_requests_seconds_count{status=~"5.."}[5m])) by (application)
      / sum(rate(http_server_requests_seconds_count[5m])) by (application)
      * 100 > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'HTTP 请求错误率过高'
      description: '服务 {{ $labels.application }} 5xx 错误率 {{ $value | printf "%.2f" }}%'
  # Warning: P95 request latency per application above 2 seconds for
  # 5 minutes, estimated from the request-duration histogram buckets.
  # NOTE(review): needs http_server_requests_seconds_bucket series to exist;
  # confirm histogram buckets are enabled in the services' metrics config.
  - alert: HighResponseTime
    expr: >-
      histogram_quantile(0.95,
      sum(rate(http_server_requests_seconds_bucket[5m])) by (le, application))
      > 2
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'HTTP 响应时间过长'
      description: '服务 {{ $labels.application }} P95 响应时间 {{ $value | printf "%.2f" }} 秒'
# ==================== Database connection pool alerts ====================
- name: database_connections
  rules:
  # Warning: active HikariCP connections exceed 80% of the pool maximum
  # for 5 minutes.
  - alert: HikariPoolUsageHigh
    expr: (hikaricp_connections_active / hikaricp_connections_max) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '数据库连接池使用率过高'
      description: '服务 {{ $labels.application }} 连接池使用率 {{ $value | printf "%.2f" }}%'
  # Warning: at least one connection request has been pending
  # for 2 minutes.
  - alert: HikariPoolPending
    expr: hikaricp_connections_pending > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: '数据库连接池存在等待'
      description: '服务 {{ $labels.application }} 有 {{ $value }} 个连接请求在等待'
# ==================== System resource alerts ====================
- name: system_resources
  rules:
  # Warning: system_cpu_usage, scaled to a percentage, above 80
  # for 5 minutes.
  - alert: HighCPUUsage
    expr: system_cpu_usage * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'CPU 使用率过高'
      description: '服务 {{ $labels.application }} CPU 使用率 {{ $value | printf "%.2f" }}%'
  # Warning: open file descriptors above 80% of the process limit
  # for 5 minutes.
  - alert: HighFileDescriptorUsage
    expr: process_files_open_files / process_files_max_files * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '文件描述符使用率过高'
      description: '服务 {{ $labels.application }} 文件描述符使用率 {{ $value | printf "%.2f" }}%'