mirror of
https://github.com/openimsdk/open-im-server.git
synced 2026-04-28 06:19:20 +08:00
feat: enhance configuration files with detailed comments for clarity
This commit is contained in:
+22
-20
@@ -1,34 +1,36 @@
|
||||
# Global Alertmanager runtime and SMTP settings.
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
smtp_from: alert@openim.io
|
||||
smtp_smarthost: smtp.163.com:465
|
||||
smtp_auth_username: alert@openim.io
|
||||
smtp_auth_password: YOURAUTHPASSWORD
|
||||
smtp_require_tls: false
|
||||
smtp_hello: xxx
|
||||
resolve_timeout: 5m # Wait time before an alert is considered resolved when no further updates are received.
|
||||
smtp_from: alert@openim.io # Sender address displayed in alert emails.
|
||||
smtp_smarthost: smtp.163.com:465 # SMTP relay endpoint in host:port format.
|
||||
smtp_auth_username: alert@openim.io # SMTP authentication username (commonly the same as smtp_from).
|
||||
smtp_auth_password: YOURAUTHPASSWORD # SMTP authorization token or app password.
|
||||
smtp_require_tls: false # Set to true when your SMTP provider requires STARTTLS.
|
||||
smtp_hello: xxx # HELO/EHLO identity presented to the SMTP server.
|
||||
|
||||
templates:
|
||||
- /etc/alertmanager/email.tmpl
|
||||
- /etc/alertmanager/email.tmpl # Go template file used to render HTML email content.
|
||||
|
||||
# Root routing tree for all incoming alerts.
|
||||
route:
|
||||
group_by: [ 'alertname' ]
|
||||
group_wait: 5s
|
||||
group_interval: 5s
|
||||
repeat_interval: 5m
|
||||
receiver: email
|
||||
group_by: [ 'alertname' ] # Alerts sharing this label value are batched into one notification.
|
||||
group_wait: 5s # Initial delay before sending the first notification for a new alert group.
|
||||
group_interval: 5s # Minimum interval between notifications for the same alert group.
|
||||
repeat_interval: 5m # Reminder interval while an alert group remains firing.
|
||||
receiver: email # Default receiver when no child route matches.
|
||||
routes:
|
||||
- matchers:
|
||||
- alertname = "XXX"
|
||||
group_by: [ 'instance' ]
|
||||
- alertname = "XXX" # Example matcher; replace with a real alert name or remove this route.
|
||||
group_by: [ 'instance' ] # Override grouping for this specific route.
|
||||
group_wait: 5s
|
||||
group_interval: 5s
|
||||
repeat_interval: 5m
|
||||
receiver: email
|
||||
|
||||
receivers:
|
||||
- name: email
|
||||
- name: email # Receiver name referenced by route.receiver.
|
||||
email_configs:
|
||||
- to: 'alert@example.com'
|
||||
html: '{{ template "email.to.html" . }}'
|
||||
headers: { Subject: "[OPENIM-SERVER]Alarm" }
|
||||
send_resolved: true
|
||||
- to: 'alert@example.com' # Recipient mailbox for alert notifications.
|
||||
html: '{{ template "email.to.html" . }}' # Rendered with the template declared in email.tmpl.
|
||||
headers: { Subject: "[OPENIM-SERVER]Alarm" } # Custom email subject line.
|
||||
send_resolved: true # Also send a notification when the alert recovers.
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
{{/* OpenIM Alertmanager email template.
|
||||
This template renders both firing and resolved alerts.
|
||||
Each alert entry reads labels and annotations from Prometheus rule definitions. */}}
|
||||
{{ define "email.to.html" }}
|
||||
{{ if eq .Status "firing" }}
|
||||
{{ range .Alerts }}
|
||||
|
||||
@@ -1,30 +1,31 @@
|
||||
# Default Prometheus alert groups for OpenIM.
|
||||
groups:
|
||||
- name: instance_down
|
||||
- name: instance_down # Fires when a monitored target remains unreachable.
|
||||
rules:
|
||||
- alert: InstanceDown
|
||||
expr: up == 0
|
||||
for: 1m
|
||||
expr: up == 0 # The built-in "up" metric is 0 when the latest scrape fails.
|
||||
for: 1m # Trigger only if the condition remains true for more than 1 minute.
|
||||
labels:
|
||||
severity: critical
|
||||
severity: critical # Severity label attached to the alert; Alertmanager routes can match on it if configured to do so.
|
||||
annotations:
|
||||
summary: "Instance {{ $labels.instance }} down"
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
|
||||
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
|
||||
|
||||
- name: database_insert_failure_alerts
|
||||
- name: database_insert_failure_alerts # Detects failures when persisting messages to Redis or MongoDB.
|
||||
rules:
|
||||
- alert: DatabaseInsertFailed
|
||||
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0)
|
||||
for: 1m
|
||||
expr: (increase(msg_insert_redis_failed_total[5m]) > 0) or (increase(msg_insert_mongo_failed_total[5m]) > 0) # Any positive increase indicates write failures occurred in the last 5 minutes.
|
||||
for: 1m # Avoid firing on very short spikes.
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Increase in MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter detected"
|
||||
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter has increased in the last 5 minutes, indicating failures in message insert operations to Redis or MongoDB,maybe the redis or mongodb is crash."
|
||||
description: "Either MsgInsertRedisFailedCounter or MsgInsertMongoFailedCounter increased in the last 5 minutes, indicating message insert failures to Redis or MongoDB and a possible backend outage."
|
||||
|
||||
- name: registrations_few
|
||||
- name: registrations_few # Operational early-warning rule for unusually low login/registration activity.
|
||||
rules:
|
||||
- alert: RegistrationsFew
|
||||
expr: increase(user_login_total[1h]) == 0
|
||||
expr: increase(user_login_total[1h]) == 0 # No successful login/registration events observed in 1 hour.
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
@@ -32,10 +33,10 @@ groups:
|
||||
summary: "Too few registrations within the time frame"
|
||||
description: "The number of registrations in the last hour is 0. There might be some issues."
|
||||
|
||||
- name: messages_few
|
||||
- name: messages_few # Operational early-warning rule for unusually low messaging activity.
|
||||
rules:
|
||||
- alert: MessagesFew
|
||||
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0
|
||||
expr: (increase(single_chat_msg_process_success_total[1h])+increase(group_chat_msg_process_success_total[1h])) == 0 # No successful single or group messages observed in 1 hour.
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
|
||||
@@ -8,7 +8,7 @@ api:
|
||||
|
||||
|
||||
prometheus:
|
||||
# Whether to enable prometheus
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# autoSetPorts indicates whether to automatically set the ports
|
||||
autoSetPorts: true
|
||||
|
||||
@@ -8,7 +8,7 @@ rpc:
|
||||
ports: [ 10140, 10141, 10142, 10143, 10144, 10145, 10146, 10147, 10148, 10149, 10150, 10151, 10152, 10153, 10154, 10155 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# autoSetPorts indicates whether to automatically set the ports
|
||||
autoSetPorts: true
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10170, 10171, 10172, 10173, 10174, 10175, 10176, 10177, 10178, 10179, 10180, 10181, 10182, 10183, 10184, 10185 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10200 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10220 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10240 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10260 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10280 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10300 ]
|
||||
|
||||
prometheus:
|
||||
# Enable or disable Prometheus monitoring
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# List of ports that Prometheus listens on; the number of ports must match the number of rpc.ports to ensure correct monitoring setup
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
@@ -10,7 +10,7 @@ rpc:
|
||||
ports: [ 10320 ]
|
||||
|
||||
prometheus:
|
||||
# Whether to enable prometheus
|
||||
# Enable Prometheus metrics exposure for this service; set to true to allow scraping.
|
||||
enable: true
|
||||
# Prometheus listening ports; the number of ports must be consistent with the number of rpc.ports
|
||||
# It will only take effect when autoSetPorts is set to false.
|
||||
|
||||
+24
-25
@@ -1,35 +1,34 @@
|
||||
# my global config
|
||||
# Global Prometheus runtime settings.
|
||||
global:
|
||||
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
|
||||
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
|
||||
# scrape_timeout is set to the global default (10s).
|
||||
# scrape_timeout defaults to 10s unless overridden in a specific scrape job.
|
||||
|
||||
# Alertmanager configuration
|
||||
# Alertmanager endpoints that receive alert events from Prometheus.
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets: [127.0.0.1:19093]
|
||||
- targets: [127.0.0.1:19093] # Alertmanager address in host:port format.
|
||||
|
||||
# Load rules once and periodically evaluate them according to the global evaluation_interval.
|
||||
# Rule files loaded by Prometheus.
|
||||
rule_files:
|
||||
- instance-down-rules.yml
|
||||
- instance-down-rules.yml # Default OpenIM alert rules; add more files here if needed.
|
||||
# - first_rules.yml
|
||||
# - second_rules.yml
|
||||
|
||||
# A scrape configuration containing exactly one endpoint to scrape:
|
||||
# Here it's Prometheus itself.
|
||||
# Scrape jobs used to collect infrastructure and OpenIM service metrics.
|
||||
scrape_configs:
|
||||
# The job name is added as a label "job=job_name" to any timeseries scraped from this config.
|
||||
# Monitored information captured by prometheus
|
||||
|
||||
# prometheus fetches application services
|
||||
# The job_name value is attached as the "job" label in collected time series.
|
||||
- job_name: node_exporter
|
||||
static_configs:
|
||||
- targets: [ 127.0.0.1:19100 ]
|
||||
- targets: [ 127.0.0.1:19100 ] # node_exporter endpoint for host CPU, memory, disk, and network metrics.
|
||||
|
||||
# OpenIM services are discovered dynamically from the admin API.
|
||||
# For multi-host deployments, replace 127.0.0.1 with a reachable internal address.
|
||||
|
||||
- job_name: openimserver-openim-api
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/api"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/api" # Service discovery endpoint for OpenIM API instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12002 ]
|
||||
# labels:
|
||||
@@ -37,7 +36,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-msggateway
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_gateway" # Service discovery endpoint for msggateway instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12140 ]
|
||||
# # - targets: [ 127.0.0.1:12140, 127.0.0.1:12141, 127.0.0.1:12142, 127.0.0.1:12143, 127.0.0.1:12144, 127.0.0.1:12145, 127.0.0.1:12146, 127.0.0.1:12147, 127.0.0.1:12148, 127.0.0.1:12149, 127.0.0.1:12150, 127.0.0.1:12151, 127.0.0.1:12152, 127.0.0.1:12153, 127.0.0.1:12154, 127.0.0.1:12155 ]
|
||||
@@ -46,7 +45,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-msgtransfer
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg_transfer" # Service discovery endpoint for msgtransfer instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027 ]
|
||||
# # - targets: [ 127.0.0.1:12020, 127.0.0.1:12021, 127.0.0.1:12022, 127.0.0.1:12023, 127.0.0.1:12024, 127.0.0.1:12025, 127.0.0.1:12026, 127.0.0.1:12027, 127.0.0.1:12028, 127.0.0.1:12029, 127.0.0.1:12030, 127.0.0.1:12031, 127.0.0.1:12032, 127.0.0.1:12033, 127.0.0.1:12034, 127.0.0.1:12035 ]
|
||||
@@ -55,7 +54,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-push
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/push"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/push" # Service discovery endpoint for push service instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177 ]
|
||||
## - targets: [ 127.0.0.1:12170, 127.0.0.1:12171, 127.0.0.1:12172, 127.0.0.1:12173, 127.0.0.1:12174, 127.0.0.1:12175, 127.0.0.1:12176, 127.0.0.1:12177, 127.0.0.1:12178, 127.0.0.1:12179, 127.0.0.1:12180, 127.0.0.1:12181, 127.0.0.1:12182, 127.0.0.1:12183, 127.0.0.1:12184, 127.0.0.1:12185 ]
|
||||
@@ -64,7 +63,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-auth
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/auth"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/auth" # Service discovery endpoint for auth RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12200 ]
|
||||
# labels:
|
||||
@@ -72,7 +71,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-conversation
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/conversation"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/conversation" # Service discovery endpoint for conversation RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12220 ]
|
||||
# labels:
|
||||
@@ -80,7 +79,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-friend
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/friend"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/friend" # Service discovery endpoint for friend RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12240 ]
|
||||
# labels:
|
||||
@@ -88,7 +87,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-group
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/group"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/group" # Service discovery endpoint for group RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12260 ]
|
||||
# labels:
|
||||
@@ -96,7 +95,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-msg
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/msg" # Service discovery endpoint for msg RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12280 ]
|
||||
# labels:
|
||||
@@ -104,7 +103,7 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-third
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/third"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/third" # Service discovery endpoint for third-party RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12300 ]
|
||||
# labels:
|
||||
@@ -112,8 +111,8 @@ scrape_configs:
|
||||
|
||||
- job_name: openimserver-openim-rpc-user
|
||||
http_sd_configs:
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/user"
|
||||
- url: "http://127.0.0.1:10002/prometheus_discovery/user" # Service discovery endpoint for user RPC instances.
|
||||
# static_configs:
|
||||
# - targets: [ 127.0.0.1:12320 ]
|
||||
# labels:
|
||||
# namespace: default
|
||||
# namespace: default
|
||||
|
||||
Reference in New Issue
Block a user