feat: enhance configuration files with detailed comments for clarity

2026-03-20 18:24:51 +08:00
parent 5028624fa3
commit 801ac740b7
15 changed files with 74 additions and 69 deletions
@@ -1,34 +1,36 @@
+# Global Alertmanager runtime and SMTP settings.
 global:
-  resolve_timeout: 5m
-  smtp_from: alert@openim.io
-  smtp_smarthost: smtp.163.com:465
-  smtp_auth_username: alert@openim.io
-  smtp_auth_password: YOURAUTHPASSWORD
-  smtp_require_tls: false
-  smtp_hello: xxx
+  resolve_timeout: 5m # Wait time before an alert is considered resolved when no further updates are received.
+  smtp_from: alert@openim.io # Sender address displayed in alert emails.
+  smtp_smarthost: smtp.163.com:465 # SMTP relay endpoint in host:port format.
+  smtp_auth_username: alert@openim.io # SMTP authentication username (commonly the same as smtp_from).
+  smtp_auth_password: YOURAUTHPASSWORD # SMTP authorization token or app password.
+  smtp_require_tls: false # Set to true when your SMTP provider requires STARTTLS.
+  smtp_hello: xxx # HELO/EHLO identity presented to the SMTP server.

 templates:
-  - /etc/alertmanager/email.tmpl
+  - /etc/alertmanager/email.tmpl # Go template file used to render HTML email content.

+# Root routing tree for all incoming alerts.
 route:
-  group_by: [ 'alertname' ]
-  group_wait: 5s
-  group_interval: 5s
-  repeat_interval: 5m
-  receiver: email
+  group_by: [ 'alertname' ] # Alerts sharing this label value are batched into one notification.
+  group_wait: 5s # Initial delay before sending the first notification for a new alert group.
+  group_interval: 5s # Wait time before notifying about new alerts added to a group that has already been notified.
+  repeat_interval: 5m # Reminder interval while an alert group remains firing.
+  receiver: email # Default receiver when no child route matches.
  routes:
    - matchers:
-        - alertname = "XXX"
-      group_by: [ 'instance' ]
+        - alertname = "XXX" # Example matcher; replace with a real alert name or remove this route.
+      group_by: [ 'instance' ] # Override grouping for this specific route.
      group_wait: 5s
      group_interval: 5s
      repeat_interval: 5m
      receiver: email

 receivers:
-  - name: email
+  - name: email # Receiver name referenced by route.receiver.
    email_configs:
-      - to: 'alert@example.com'
-        html: '{{ template "email.to.html" . }}'
-        headers: { Subject: "[OPENIM-SERVER]Alarm" }
-        send_resolved: true
+      - to: 'alert@example.com' # Recipient mailbox for alert notifications.
+        html: '{{ template "email.to.html" . }}' # Rendered with the template declared in email.tmpl.
+        headers: { Subject: "[OPENIM-SERVER]Alarm" } # Custom email subject line.
+        send_resolved: true # Also send a notification when the alert recovers.
@@ -1,3 +1,6 @@
+{{/* OpenIM Alertmanager email template.
+This template renders both firing and resolved alerts.
+Each alert entry reads labels and annotations from Prometheus rule definitions. */}}
 {{ define "email.to.html" }}
 {{ if eq .Status "firing" }}
    {{ range .Alerts }}
@@ -1,30 +1,31 @@
+# Default Prometheus alert groups for OpenIM.
 groups:
-  - name: instance_down
+  - name: instance_down # Fires when a monitored target remains unreachable.
    rules:
      - alert: InstanceDown
-        expr: up == 0
-        for: 1m
+        expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails.
+        for: 1m # Trigger only if the condition remains true for more than 1 minute.
        labels:
-          severity: critical
+          severity: critical # Severity label attached to the alert; Alertmanager routes and templates can match on or display it.
        annotations:
          summary: "Instance {{ $labels.instance }} down"
-          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
+          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."

-  - name: database_insert_failure_alerts
+  - name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB.
    rules:
      - alert: DatabaseInsertFailed
-        expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
-        for: 1m
+        expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes.
+        for: 1m # Condition must hold for 1 minute before firing; a single failure keeps increase(...[5m]) positive for about 5 minutes.
        labels:
          severity: critical
        annotations:
          summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
-          description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
+          description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage."

-  - name: registrations_few
+  - name: registrations_few # Operational early-warning rule for unusually low login/registration activity.
    rules:
      - alert: RegistrationsFew
-        expr: increase(user_login_total[1h]) == 0
+        expr: increase(user_login_total[1h]) == 0 # The user_login_total counter did not increase during the past hour.
        for: 1m
        labels:
          severity: info
@@ -32,10 +33,10 @@ groups:
          summary: "Too few registrations within the time frame"
          description: "The number of registrations in the last hour is 0. There might be some issues."

-  - name: messages_few
+  - name: messages_few # Operational early-warning rule for unusually low messaging activity.
    rules:
      - alert: MessagesFew
-        expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0
+        expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour.
        for: 1m
        labels:
          severity: info
@@ -8,7 +8,7 @@ api:


 prometheus:
-  # Whether to enable prometheus
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # autoSetPorts indicates whether to automatically set the ports
  autoSetPorts: true
@@ -8,7 +8,7 @@ rpc:
  ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -1,5 +1,5 @@
 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # autoSetPorts indicates whether to automatically set the ports
  autoSetPorts: true
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10200 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10220 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10240 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10260 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10280 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10300 ]

 prometheus:
-  # Enable or disable Prometheus monitoring
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # List of ports that Prometheus listens on; these must match the number of rpc.ports to ensure correct monitoring setup
  # It will only take effect when autoSetPorts is set to false.
@@ -10,7 +10,7 @@ rpc:
  ports: [ 10320 ]

 prometheus:
-  # Whether to enable prometheus
+  # Enable Prometheus metrics exposure for this service; set to true to allow scraping.
  enable: true
  # Prometheus listening ports, must be consistent with the number of rpc.ports
  # It will only take effect when autoSetPorts is set to false.
@@ -1,35 +1,34 @@
-# my global config
+# Global Prometheus runtime settings.
 global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
-  # scrape_timeout is set to the global default (10s).
+  # scrape_timeout defaults to 10s unless overridden in a specific scrape job.

-# Alertmanager configuration
+# Alertmanager endpoints that receive alert events from Prometheus.
 alerting:
  alertmanagers:
    - static_configs:
-        - targets: [127.0.0.1:19093]
+        - targets: [127.0.0.1:19093] # Alertmanager address in host:port format.

-# Load rules once and periodically evaluate them according to the global evaluation_interval.
+# Rule files loaded by Prometheus.
 rule_files:
-  - instance-down-rules.yml
+  - instance-down-rules.yml # Default OpenIM alert rules; add more files here if needed.
 # - first_rules.yml
 # - second_rules.yml

-# A scrape configuration containing exactly one endpoint to scrape:
-# Here it's Prometheus itself.
+# Scrape jobs used to collect infrastructure and OpenIM service metrics.
 scrape_configs:
-  # The job name is added as a label "job=job_name" to any timeseries scraped from this config.
-  # Monitored information captured by prometheus
-
-  # prometheus fetches application services
+  # The job_name value is attached as the "job" label in collected time series.
  - job_name: node_exporter
    static_configs:
-      - targets: [ 127.0.0.1:19100 ]
+      - targets: [ 127.0.0.1:19100 ] # node_exporter endpoint for host CPU, memory, disk, and network metrics.
+
+  # OpenIM services are discovered dynamically from the admin API.
+  # For multi-host deployments, replace 127.0.0.1 with a reachable internal address.

  - job_name: openimserver-openim-api
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/api"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/api" # Service discovery endpoint for OpenIM API instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12002 ]
 #        labels:
@@ -37,7 +36,7 @@ scrape_configs:

  - job_name: openimserver-openim-msggateway
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" # Service discovery endpoint for msggateway instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12140 ]
 #        #      - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ]
@@ -46,7 +45,7 @@ scrape_configs:

  - job_name: openimserver-openim-msgtransfer
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" # Service discovery endpoint for msgtransfer instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ]
 #        #      - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ]
@@ -55,7 +54,7 @@ scrape_configs:

  - job_name: openimserver-openim-push
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/push"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/push" # Service discovery endpoint for push service instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ]
 ##      - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180,  127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185, 127.0.0.1:12186 ]
@@ -64,7 +63,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-auth
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/auth"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/auth" # Service discovery endpoint for auth RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12200 ]
 #        labels:
@@ -72,7 +71,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-conversation
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/conversation"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/conversation" # Service discovery endpoint for conversation RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12220 ]
 #        labels:
@@ -80,7 +79,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-friend
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/friend"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/friend" # Service discovery endpoint for friend RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12240 ]
 #        labels:
@@ -88,7 +87,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-group
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/group"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/group" # Service discovery endpoint for group RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12260 ]
 #        labels:
@@ -96,7 +95,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-msg
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/msg"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/msg" # Service discovery endpoint for msg RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12280 ]
 #        labels:
@@ -104,7 +103,7 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-third
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/third"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/third" # Service discovery endpoint for third RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12300 ]
 #        labels:
@@ -112,8 +111,8 @@ scrape_configs:

  - job_name: openimserver-openim-rpc-user
    http_sd_configs:
-      - url: "http://127.0.0.1:10002/prometheus_discovery/user"
+      - url: "http://127.0.0.1:10002/prometheus_discovery/user" # Service discovery endpoint for user RPC instances.
 #    static_configs:
 #      - targets: [ 127.0.0.1:12320 ]
 #        labels:
-#          namespace: default
+#          namespace: default