#
# Copyright The CloudNativePG Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Default values for CloudNativePG.
# This is a YAML-formatted file.
# Please declare variables to be passed to your templates.

replicaCount: 1

image:
  repository: ghcr.io/cloudnative-pg/cloudnative-pg
  pullPolicy: IfNotPresent
  # -- Overrides the image tag whose default is the chart appVersion.
  tag: ""

imagePullSecrets: []
nameOverride: ""
fullnameOverride: ""

hostNetwork: false
dnsPolicy: ""

crds:
  # -- Specifies whether the CRDs should be created when installing the chart.
  create: true

# -- The webhook configuration.
webhook:
  port: 9443
  mutating:
    create: true
    failurePolicy: Fail
  validating:
    create: true
    failurePolicy: Fail
  livenessProbe:
    initialDelaySeconds: 3
  readinessProbe:
    initialDelaySeconds: 3

# -- Operator configuration.
config:
  # -- Specifies whether the secret should be created.
  create: true
  # -- The name of the configmap/secret to use.
  name: cnpg-controller-manager-config
  # -- Specifies whether it should be stored in a secret, instead of a configmap.
  secret: false
  # -- The content of the configmap/secret, see
  # https://cloudnative-pg.io/documentation/current/operator_conf/#available-options
  # for all the available options.
  data: {}
  # INHERITED_ANNOTATIONS: categories
  # INHERITED_LABELS: environment, workload, app
  # WATCH_NAMESPACE: namespace-a,namespace-b

# -- Additional arguments to be added to the operator's args list.
additionalArgs: []

# -- Array containing extra environment variables which can be templated.
# For example:
#  - name: RELEASE_NAME
#    value: "{{ .Release.Name }}"
#  - name: MY_VAR
#    value: "mySpecialKey"
additionalEnv: []

serviceAccount:
  # -- Specifies whether the service account should be created.
  create: true
  # -- The name of the service account to use.
  # If not set and create is true, a name is generated using the fullname template.
  name: ""

rbac:
  # -- Specifies whether ClusterRole and ClusterRoleBinding should be created.
  create: true
  # -- Aggregate ClusterRoles to Kubernetes default user-facing roles.
  # Ref: https://kubernetes.io/docs/reference/access-authn-authz/rbac/#user-facing-roles
  aggregateClusterRoles: false

# -- Annotations to be added to all other resources.
commonAnnotations: {}
# -- Annotations to be added to the pod.
podAnnotations: {}
# -- Labels to be added to the pod.
podLabels: {}

# -- Container Security Context.
containerSecurityContext:
  allowPrivilegeEscalation: false
  readOnlyRootFilesystem: true
  runAsUser: 10001
  runAsGroup: 10001
  seccompProfile:
    type: RuntimeDefault
  capabilities:
    drop:
      - "ALL"

# -- Security Context for the whole pod.
podSecurityContext:
  runAsNonRoot: true
  seccompProfile:
    type: RuntimeDefault
  # fsGroup: 2000

# -- Priority indicates the importance of a Pod relative to other Pods.
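# For example, you could reference an existing PriorityClass here; the value
# below is only an illustration ("system-cluster-critical" is one of the
# PriorityClasses that Kubernetes ships by default):
# priorityClassName: "system-cluster-critical"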
priorityClassName: "" service: type: ClusterIP # -- DO NOT CHANGE THE SERVICE NAME as it is currently used to generate the certificate # and can not be configured name: cnpg-webhook-service port: 443 resources: {} # If you want to specify resources, uncomment the following # lines, adjust them as necessary, and remove the curly braces after 'resources:'. # # limits: # cpu: 100m # memory: 200Mi # requests: # cpu: 100m # memory: 100Mi # -- Nodeselector for the operator to be installed. nodeSelector: {} # -- Tolerations for the operator to be installed. tolerations: [] # -- Affinity for the operator to be installed. affinity: {} monitoring: # -- Specifies whether the monitoring should be enabled. Requires Prometheus Operator CRDs. podMonitorEnabled: false # -- Metrics relabel configurations to apply to samples before ingestion. podMonitorMetricRelabelings: [] # -- Relabel configurations to apply to samples before scraping. podMonitorRelabelings: [] # -- Additional labels for the podMonitor podMonitorAdditionalLabels: {} grafanaDashboard: create: false # -- Allows overriding the namespace where the ConfigMap will be created, defaulting to the same one as the Release. namespace: "" # -- The name of the ConfigMap containing the dashboard. configMapName: "cnpg-grafana-dashboard" # -- Label that ConfigMaps should have to be loaded as dashboards. DEPRECATED: Use labels instead. sidecarLabel: "grafana_dashboard" # -- Label value that ConfigMaps should have to be loaded as dashboards. DEPRECATED: Use labels instead. sidecarLabelValue: "1" # -- Labels that ConfigMaps should have to get configured in Grafana. labels: {} # -- Annotations that ConfigMaps can have to get configured in Grafana. annotations: {} # Default monitoring queries monitoringQueriesConfigMap: # -- The name of the default monitoring configmap. name: cnpg-default-monitoring # -- A string representation of a YAML defining monitoring queries. 
# Default monitoring queries
monitoringQueriesConfigMap:
  # -- The name of the default monitoring configmap.
  name: cnpg-default-monitoring
  # -- A string representation of a YAML defining monitoring queries.
  queries: |
    backends:
      query: |
        SELECT sa.datname
          , sa.usename
          , sa.application_name
          , states.state
          , COALESCE(sa.count, 0) AS total
          , COALESCE(sa.max_tx_secs, 0) AS max_tx_duration_seconds
        FROM ( VALUES ('active')
          , ('idle')
          , ('idle in transaction')
          , ('idle in transaction (aborted)')
          , ('fastpath function call')
          , ('disabled')
          ) AS states(state)
        LEFT JOIN (
          SELECT datname
            , state
            , usename
            , COALESCE(application_name, '') AS application_name
            , COUNT(*)
            , COALESCE(EXTRACT (EPOCH FROM (max(now() - xact_start))), 0) AS max_tx_secs
          FROM pg_catalog.pg_stat_activity
          GROUP BY datname, state, usename, application_name
        ) sa ON states.state = sa.state
        WHERE sa.usename IS NOT NULL
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of the database"
        - usename:
            usage: "LABEL"
            description: "Name of the user"
        - application_name:
            usage: "LABEL"
            description: "Name of the application"
        - state:
            usage: "LABEL"
            description: "State of the backend"
        - total:
            usage: "GAUGE"
            description: "Number of backends"
        - max_tx_duration_seconds:
            usage: "GAUGE"
            description: "Maximum duration of a transaction in seconds"

    backends_waiting:
      query: |
        SELECT count(*) AS total
        FROM pg_catalog.pg_locks blocked_locks
        JOIN pg_catalog.pg_locks blocking_locks
          ON blocking_locks.locktype = blocked_locks.locktype
          AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
          AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
          AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
          AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
          AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
          AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
          AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
          AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
          AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
          AND blocking_locks.pid != blocked_locks.pid
        JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
        WHERE NOT blocked_locks.granted
      metrics:
        - total:
            usage: "GAUGE"
            description: "Total number of backends that are currently waiting on other queries"

    pg_database:
      query: |
        SELECT datname
          , pg_catalog.pg_database_size(datname) AS size_bytes
          , pg_catalog.age(datfrozenxid) AS xid_age
          , pg_catalog.mxid_age(datminmxid) AS mxid_age
        FROM pg_catalog.pg_database
        WHERE datallowconn
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of the database"
        - size_bytes:
            usage: "GAUGE"
            description: "Disk space used by the database"
        - xid_age:
            usage: "GAUGE"
            description: "Number of transactions from the frozen XID to the current one"
        - mxid_age:
            usage: "GAUGE"
            description: "Number of multiple transactions (Multixact) from the frozen XID to the current one"

    pg_postmaster:
      query: |
        SELECT EXTRACT(EPOCH FROM pg_postmaster_start_time) AS start_time
        FROM pg_catalog.pg_postmaster_start_time()
      metrics:
        - start_time:
            usage: "GAUGE"
            description: "Time at which postgres started (based on epoch)"

    pg_replication:
      query: "SELECT CASE WHEN (
                NOT pg_catalog.pg_is_in_recovery()
                OR pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn())
              THEN 0
              ELSE GREATEST (0,
                EXTRACT(EPOCH FROM (now() - pg_catalog.pg_last_xact_replay_timestamp())))
              END AS lag,
              pg_catalog.pg_is_in_recovery() AS in_recovery,
              EXISTS (TABLE pg_stat_wal_receiver) AS is_wal_receiver_up,
              (SELECT count(*) FROM pg_catalog.pg_stat_replication) AS streaming_replicas"
      metrics:
        - lag:
            usage: "GAUGE"
            description: "Replication lag behind primary in seconds"
        - in_recovery:
            usage: "GAUGE"
            description: "Whether the instance is in recovery"
        - is_wal_receiver_up:
            usage: "GAUGE"
            description: "Whether the instance wal_receiver is up"
        - streaming_replicas:
            usage: "GAUGE"
            description: "Number of streaming replicas connected to the instance"

    pg_replication_slots:
      query: |
        SELECT slot_name,
          slot_type,
          database,
          active,
          (CASE pg_catalog.pg_is_in_recovery()
            WHEN TRUE THEN pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), restart_lsn)
            ELSE pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), restart_lsn)
          END) as pg_wal_lsn_diff
        FROM pg_catalog.pg_replication_slots
        WHERE NOT temporary
      metrics:
        - slot_name:
            usage: "LABEL"
            description: "Name of the replication slot"
        - slot_type:
            usage: "LABEL"
            description: "Type of the replication slot"
        - database:
            usage: "LABEL"
            description: "Name of the database"
        - active:
            usage: "GAUGE"
            description: "Flag indicating whether the slot is active"
        - pg_wal_lsn_diff:
            usage: "GAUGE"
            description: "Replication lag in bytes"

    pg_stat_archiver:
      query: |
        SELECT archived_count
          , failed_count
          , COALESCE(EXTRACT(EPOCH FROM (now() - last_archived_time)), -1) AS seconds_since_last_archival
          , COALESCE(EXTRACT(EPOCH FROM (now() - last_failed_time)), -1) AS seconds_since_last_failure
          , COALESCE(EXTRACT(EPOCH FROM last_archived_time), -1) AS last_archived_time
          , COALESCE(EXTRACT(EPOCH FROM last_failed_time), -1) AS last_failed_time
          , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_archived_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_archived_wal_start_lsn
          , COALESCE(CAST(CAST('x'||pg_catalog.right(pg_catalog.split_part(last_failed_wal, '.', 1), 16) AS pg_catalog.bit(64)) AS pg_catalog.int8), -1) AS last_failed_wal_start_lsn
          , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time
        FROM pg_catalog.pg_stat_archiver
      metrics:
        - archived_count:
            usage: "COUNTER"
            description: "Number of WAL files that have been successfully archived"
        - failed_count:
            usage: "COUNTER"
            description: "Number of failed attempts for archiving WAL files"
        - seconds_since_last_archival:
            usage: "GAUGE"
            description: "Seconds since the last successful archival operation"
        - seconds_since_last_failure:
            usage: "GAUGE"
            description: "Seconds since the last failed archival operation"
        - last_archived_time:
            usage: "GAUGE"
            description: "Epoch of the last time WAL archiving succeeded"
        - last_failed_time:
            usage: "GAUGE"
            description: "Epoch of the last time WAL archiving failed"
        - last_archived_wal_start_lsn:
            usage: "GAUGE"
            description: "Archived WAL start LSN"
        - last_failed_wal_start_lsn:
            usage: "GAUGE"
            description: "Last failed WAL LSN"
        - stats_reset_time:
            usage: "GAUGE"
            description: "Time at which these statistics were last reset"

    pg_stat_bgwriter:
      runonserver: "<17.0.0"
      query: |
        SELECT checkpoints_timed
          , checkpoints_req
          , checkpoint_write_time
          , checkpoint_sync_time
          , buffers_checkpoint
          , buffers_clean
          , maxwritten_clean
          , buffers_backend
          , buffers_backend_fsync
          , buffers_alloc
        FROM pg_catalog.pg_stat_bgwriter
      metrics:
        - checkpoints_timed:
            usage: "COUNTER"
            description: "Number of scheduled checkpoints that have been performed"
        - checkpoints_req:
            usage: "COUNTER"
            description: "Number of requested checkpoints that have been performed"
        - checkpoint_write_time:
            usage: "COUNTER"
            description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in milliseconds"
        - checkpoint_sync_time:
            usage: "COUNTER"
            description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in milliseconds"
        - buffers_checkpoint:
            usage: "COUNTER"
            description: "Number of buffers written during checkpoints"
        - buffers_clean:
            usage: "COUNTER"
            description: "Number of buffers written by the background writer"
        - maxwritten_clean:
            usage: "COUNTER"
            description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers"
        - buffers_backend:
            usage: "COUNTER"
            description: "Number of buffers written directly by a backend"
        - buffers_backend_fsync:
            usage: "COUNTER"
            description: "Number of times a backend had to execute its own fsync call (normally the background writer handles those even when the backend does its own write)"
        - buffers_alloc:
            usage: "COUNTER"
            description: "Number of buffers allocated"

    pg_stat_bgwriter_17:
      runonserver: ">=17.0.0"
      name: pg_stat_bgwriter
      query: |
        SELECT buffers_clean
          , maxwritten_clean
          , buffers_alloc
          , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time
        FROM pg_catalog.pg_stat_bgwriter
      metrics:
        - buffers_clean:
            usage: "COUNTER"
            description: "Number of buffers written by the background writer"
        - maxwritten_clean:
            usage: "COUNTER"
            description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers"
        - buffers_alloc:
            usage: "COUNTER"
            description: "Number of buffers allocated"
        - stats_reset_time:
            usage: "GAUGE"
            description: "Time at which these statistics were last reset"

    pg_stat_checkpointer:
      runonserver: ">=17.0.0"
      query: |
        SELECT num_timed AS checkpoints_timed
          , num_requested AS checkpoints_req
          , restartpoints_timed
          , restartpoints_req
          , restartpoints_done
          , write_time
          , sync_time
          , buffers_written
          , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time
        FROM pg_catalog.pg_stat_checkpointer
      metrics:
        - checkpoints_timed:
            usage: "COUNTER"
            description: "Number of scheduled checkpoints that have been performed"
        - checkpoints_req:
            usage: "COUNTER"
            description: "Number of requested checkpoints that have been performed"
        - restartpoints_timed:
            usage: "COUNTER"
            description: "Number of scheduled restartpoints due to timeout or after a failed attempt to perform it"
        - restartpoints_req:
            usage: "COUNTER"
            description: "Number of requested restartpoints that have been performed"
        - restartpoints_done:
            usage: "COUNTER"
            description: "Number of restartpoints that have been performed"
        - write_time:
            usage: "COUNTER"
            description: "Total amount of time that has been spent in the portion of processing checkpoints and restartpoints where files are written to disk, in milliseconds"
        - sync_time:
            usage: "COUNTER"
            description: "Total amount of time that has been spent in the portion of processing checkpoints and restartpoints where files are synchronized to disk, in milliseconds"
        - buffers_written:
            usage: "COUNTER"
            description: "Number of buffers written during checkpoints and restartpoints"
        - stats_reset_time:
            usage: "GAUGE"
            description: "Time at which these statistics were last reset"

    pg_stat_database:
      query: |
        SELECT datname
          , xact_commit
          , xact_rollback
          , blks_read
          , blks_hit
          , tup_returned
          , tup_fetched
          , tup_inserted
          , tup_updated
          , tup_deleted
          , conflicts
          , temp_files
          , temp_bytes
          , deadlocks
          , blk_read_time
          , blk_write_time
        FROM pg_catalog.pg_stat_database
      metrics:
        - datname:
            usage: "LABEL"
            description: "Name of this database"
        - xact_commit:
            usage: "COUNTER"
            description: "Number of transactions in this database that have been committed"
        - xact_rollback:
            usage: "COUNTER"
            description: "Number of transactions in this database that have been rolled back"
        - blks_read:
            usage: "COUNTER"
            description: "Number of disk blocks read in this database"
        - blks_hit:
            usage: "COUNTER"
            description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)"
        - tup_returned:
            usage: "COUNTER"
            description: "Number of rows returned by queries in this database"
        - tup_fetched:
            usage: "COUNTER"
            description: "Number of rows fetched by queries in this database"
        - tup_inserted:
            usage: "COUNTER"
            description: "Number of rows inserted by queries in this database"
        - tup_updated:
            usage: "COUNTER"
            description: "Number of rows updated by queries in this database"
        - tup_deleted:
            usage: "COUNTER"
            description: "Number of rows deleted by queries in this database"
        - conflicts:
            usage: "COUNTER"
            description: "Number of queries canceled due to conflicts with recovery in this database"
        - temp_files:
            usage: "COUNTER"
            description: "Number of temporary files created by queries in this database"
        - temp_bytes:
            usage: "COUNTER"
            description: "Total amount of data written to temporary files by queries in this database"
        - deadlocks:
            usage: "COUNTER"
            description: "Number of deadlocks detected in this database"
        - blk_read_time:
            usage: "COUNTER"
            description: "Time spent reading data file blocks by backends in this database, in milliseconds"
        - blk_write_time:
            usage: "COUNTER"
            description: "Time spent writing data file blocks by backends in this database, in milliseconds"

    pg_stat_replication:
      primary: true
      query: |
        SELECT usename
          , COALESCE(application_name, '') AS application_name
          , COALESCE(client_addr::text, '') AS client_addr
          , COALESCE(client_port::text, '') AS client_port
          , EXTRACT(EPOCH FROM backend_start) AS backend_start
          , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age
          , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes
          , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes
          , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes
          , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes
          , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds
          , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds
          , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds
        FROM pg_catalog.pg_stat_replication
      metrics:
        - usename:
            usage: "LABEL"
            description: "Name of the replication user"
        - application_name:
            usage: "LABEL"
            description: "Name of the application"
        - client_addr:
            usage: "LABEL"
            description: "Client IP address"
        - client_port:
            usage: "LABEL"
            description: "Client TCP port"
        - backend_start:
            usage: "COUNTER"
            description: "Time when this process was started"
        - backend_xmin_age:
            usage: "COUNTER"
            description: "The age of this standby's xmin horizon"
        - sent_diff_bytes:
            usage: "GAUGE"
            description: "Difference in bytes from the last write-ahead log location sent on this connection"
        - write_diff_bytes:
            usage: "GAUGE"
            description: "Difference in bytes from the last write-ahead log location written to disk by this standby server"
        - flush_diff_bytes:
            usage: "GAUGE"
            description: "Difference in bytes from the last write-ahead log location flushed to disk by this standby server"
        - replay_diff_bytes:
            usage: "GAUGE"
            description: "Difference in bytes from the last write-ahead log location replayed into the database on this standby server"
        - write_lag_seconds:
            usage: "GAUGE"
            description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it"
        - flush_lag_seconds:
            usage: "GAUGE"
            description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it"
        - replay_lag_seconds:
            usage: "GAUGE"
            description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it"

    pg_settings:
      query: |
        SELECT name,
          CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting
        FROM pg_catalog.pg_settings
        WHERE vartype IN ('integer', 'real', 'bool')
        ORDER BY 1
      metrics:
        - name:
            usage: "LABEL"
            description: "Name of the setting"
        - setting:
            usage: "GAUGE"
            description: "Setting value"
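# Each entry above pairs a "query" with a "metrics" list that declares the usage
# (LABEL, GAUGE or COUNTER) and description of every returned column. A minimal
# sketch of what an additional, custom entry in the "queries" string would look
# like (the name and query below are purely illustrative):
#
# pg_custom_example:
#   query: |
#     SELECT 1 AS up
#   metrics:
#     - up:
#         usage: "GAUGE"
#         description: "Static value showing that custom queries are collected"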