Postgresql 高可用安装
PostgreSQL 13.7 安装
部署架构
资源规划
主机名 | IP | 操作系统 | PG角色 | ETCD | Patroni | HA Proxy | Keepalived | Grafana | Prometheus |
---|---|---|---|---|---|---|---|---|---|
postgresql-1 | 10.18.1.1 | CentOS-7.9 | primary | √ | √ | √ | √ | - | - |
postgresql-2 | 10.18.1.2 | CentOS-7.9 | hot standby | √ | √ | √ | √ | - | - |
postgresql-3 | 10.18.1.3 | CentOS-7.9 | hot standby | √ | √ | - | - | √ | √ |
端口矩阵
程序 | 端口 | 描述 |
---|---|---|
PostgreSQL | 5432 | 数据库端口 |
Patroni | 8008 | RestAPI 端口 |
etcd | 2379、2380 | 客户端及服务端监听端口 |
haproxy | 1080、35432、35433 | Web 及数据库端口 |
prometheus | 9090 | Web 端口 |
grafana | 3000 | Web 端口 |
node_exporter | 9100 | metrics 端口 |
postgres_exporter | 9187 | metrics 端口 |
haproxy_exporter | 9101 | metrics 端口 |
组件版本
-
Python-3.6
-
Patroni-2.1.4
-
HAProxy-2.4.18下载地址
-
Keepalived-2.2.4 下载地址
-
Prometheus-2.37.0(LTS)下载地址
-
Grafana-8.5.11 下载地址
安装
基础包安装及配置
# 安装基础包
# 离线下载
# yum -y install vim bash-completion wget curl wget gcc make flex libselinux-devel readline-devel zlib zlib-devel openssl-devel pam-devel systemd-devel libpq5 libpq5-devel --downloadonly --downloaddir=/root/pkg/base
# 在线安装
yum -y install vim bash-completion wget curl wget \
gcc make flex libselinux-devel readline-devel \
zlib zlib-devel openssl-devel pam-devel systemd-devel \
libpq5 libpq5-devel --downloadonly --downloaddir=/root/pkg/base
# 前置检查
# 禁用防火墙
systemctl stop firewalld.service
systemctl disable firewalld.service
# 需要确定网络后再禁用
#systemctl stop NetworkManager.service
#systemctl disable NetworkManager.service
# 关闭 selinux
sed -i "s/SELINUX=enforcing/SELINUX=disabled/g" /etc/selinux/config
setenforce 0
# 修改主机名
hostnamectl set-hostname postgres-{1..3}
# 修改时间同步
timedatectl
timedatectl set-timezone Asia/Shanghai
timedatectl set-ntp true
# 手动配置时间
# date -s "20211001 00:00:00"
# 同步硬件时间
# hwclock --hctosys
# 下载离线包
# sudo yum install -y chrony --downloadonly --downloaddir=/root/pkg/base
# 在线安装
sudo yum install -y chrony
# 配置时间同步主备服务器
# 主
cat << EOF > /etc/chrony.conf
server ntp.aliyun.con iburst
driftfile /var/lib/chrony/drift
hwclockfile /etc/adjtime
allow 10.18.0.0/16
local stratum 10
log statistics measurements tracking
logdir /var/log/chrony
maxupdateskew 100.0
EOF
# 备
cat << EOF >>/etc/chrony.conf
server 10.18.1.1 iburst
driftfile /var/lib/chrony/drift
hwclockfile /etc/adjtime
log statistics measurements tracking
logdir /var/log/chrony
maxupdateskew 100.0
EOF
# 重启时间同步服务
systemctl restart chronyd
# 配置解析
cat << EOF >>/etc/hosts
10.18.1.1 postgres-1
10.18.1.2 postgres-2
10.18.1.3 postgres-3
EOF
# 配置数据目录
mkdir -p /data/app/{patroni,grafana,prometheus,sources}
mkdir -p /data/{pgdata,pgarchive}
# 更改 PG 目录所属
chown -R postgres:postgres /data/pgdata
chown -R postgres:postgres /data/pgarchive
chmod 0700 /data/pgdata
chmod 0700 /data/pgarchive
PostgreSQL
# Install the repository RPM:
curl -o /roog/pgk/base/pgdg-redhat-repo-latest.noarch.rpm https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm
sudo yum install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-7-x86_64/pgdg-redhat-repo-latest.noarch.rpm
# 同步下载官方源
# sudo yum install -y yum-utils createrepo --downloadonly --downloaddir=/root/pkg/base
sudo yum install -y yum-utils createrepo
# 同步仓库源中所有包
# sudo reposync -r pgdg13 -p /root/pkg/pgdg13
# 参考,内核参数优化
cat >> /etc/sysctl.conf << EOF
#for postgres db 13.7
kernel.shmall = 966327 # expr `free |grep Mem|awk '{print $2 *1024}'` / `getconf PAGE_SIZE`
kernel.shmmax = 3958075392 # free |grep Mem|awk '{print $2 *1024}'
kernel.shmmni = 4096
kernel.sem = 50100 64128000 50100 1280
fs.file-max = 76724200
net.ipv4.ip_local_port_range = 9000 65000
net.core.rmem_default = 1048576
net.core.rmem_max = 4194304
net.core.wmem_default = 262144
net.core.wmem_max = 1048576
net.ipv4.tcp_tw_recycle = 1
net.ipv4.tcp_max_syn_backlog = 4096
net.core.netdev_max_backlog = 10000
vm.overcommit_memory = 0
fs.aio-max-nr = 40960000
net.ipv4.tcp_timestamps = 0
vm.dirty_ratio=20
vm.dirty_background_ratio=3
vm.dirty_writeback_centisecs=100
vm.dirty_expire_centisecs=500
vm.swappiness=10
vm.min_free_kbytes=524288
EOF
# 参考,修改资源限制
cat >> /etc/security/limits.conf << EOF
#for postgres db 13.3
* soft nofile 1048576
* hard nofile 1048576
* soft nproc unlimited
* hard nproc unlimited
* soft core unlimited
* hard core unlimited
* soft memlock unlimited
* hard memlock unlimited
EOF
# 修改所有节点环境变量
cat >> /etc/profile << EOF
export LANG=en_US.utf8
export PGHOME=/usr/pgsql-13/
export PGUSER=postgres
export PGPORT=5432
export PGDATA=/data/pgdata
export PATH=\$PGHOME/bin:\$PATH:\$HOME/bin
EOF
# Install PostgreSQL:
# sudo yum install -y postgresql13-server-13.7 postgresql13-contrib-13.7 --downloadonly --downloaddir=/root/pkg/base
sudo yum install -y postgresql13-server-13.7 postgresql13-contrib-13.7
# 更改配置
echo 'export PGDATA=/data/pgdata' >> /etc/profile
echo 'export PGHOME=/usr/pgsql-13' >> /etc/profile
source /etc/profile
# 修改服务中的默认 PGDATA 目录,后续会禁用 postgresql-13 服务,由 patroni 接管
sed -i 's/PGDATA=\/var\/lib\/pgsql\/13\/data\//PGDATA=\/data\/pgdata\//g' /usr/lib/systemd/system/postgresql-13.service
# Optionally initialize the database and enable automatic start:
# 官方 rpm 安装包初始化数据库方式,可选
# sudo /usr/pgsql-13/bin/postgresql-13-setup initdb --data-checksums
# 使用 initdb 初始化数据库,更改 wal-segsize 大小,默认 16M,数据量大时,会产生大量文件,值为 2 的 N 次方。修改之后确保 min_wal_size 至少是此值的 2 倍大小。
# patroni 是使用 pg_ctl initdb 初始化,部分选项不支持,如:wal-segsize
/usr/pgsql-13/bin/initdb --wal-segsize=64 --data-checksums --encoding=UTF8 --locale=C --allow-group-access --pgdata=/data/pgdata
# 启动服务,后续通过 patroni 启动
# sudo systemctl enable postgresql-13
# sudo systemctl start postgresql-13
# 所有节点配置密码文件
su - postgres
cat >> ~/.pgpass << EOF
# hostname:port:database:username:password
10.18.1.1:5432:replication:replica:d64ea0ae96efaf336dd8b62ebecee930
10.18.1.2:5432:replication:replica:d64ea0ae96efaf336dd8b62ebecee930
10.18.1.3:5432:replication:replica:d64ea0ae96efaf336dd8b62ebecee930
EOF
chmod 0600 .pgpass
# 手动执行备份
# pg_basebackup -h 192.168.58.10 -p 5432 -U replica -D $PGDATA -Fp -P -X stream -R -v
# 测试主备
# psql -c 'select pid,state,client_addr,sync_priority,sync_state from pg_stat_replication;'
Etcd
# 下载离线包
# sudo yum install -y etcd --downloadonly --downloaddir=/root/pkg/base
wget https://github.com/etcd-io/etcd/releases/download/v3.5.1/etcd-v3.5.1-linux-amd64.tar.gz
# 解压
tar -zxvf etcd-v3.5.1-linux-amd64.tar.gz -C /data/app
# 创建 etcd 启动脚本
[root@pgtest1 ~]# vi /data/app/etcd-v3.5.1-linux-amd64/start_etcd.sh
/data/app/etcd-v3.5.1-linux-amd64/etcd --data-dir=/data/app/etcd-v3.5.1-linux-amd64/data \
--name etcd_posgresql_1 \
--listen-peer-urls http://10.18.1.1:2380 \
--listen-client-urls http://10.18.1.1:2379,http://127.0.0.1:2379 \
--initial-advertise-peer-urls http://10.18.1.1:2380 \
--advertise-client-urls http://10.18.1.1:2379 \
--initial-cluster-token etcd-cluster-posgresql \
--initial-cluster etcd_posgresql_1=http://10.18.1.1:2380,etcd_posgresql_2=http://10.18.1.2:2380,etcd_posgresql_3=http://10.18.1.3:2380 \
--initial-cluster-state new \
--enable-v2
# 添加执行权限
chmod +x /data/app/etcd-v3.5.1-linux-amd64/start_etcd.sh
# 配置服务
cat << EOF >> /usr/lib/systemd/system/etcd.service
[Unit]
Description=etcd
After=network.target remote-fs.target nss-lookup.target
[Service]
Type=forking
ExecStart=/bin/bash -c "/data/app/etcd-v3.5.1-linux-amd64/start_etcd.sh >> /data/app/etcd-v3.5.1-linux-amd64/start_etcd.log 2>&1 &"
ExecStop=/usr/bin/killall start_etcd
[Install]
WantedBy=multi-user.target
EOF
echo 'export PATH=/data/app/etcd-v3.5.1-linux-amd64:$PATH' >> /etc/profile
# 启动
systemctl daemon-reload
systemctl start etcd.service
systemctl enable etcd.service
# ETCD 常用维护
# 检查集群节点的状态
etcdctl endpoint status --cluster -w table
# 列出集群中的所有成员
etcdctl member list -w table
# 检查集群节点健康状况
etcdctl endpoint health --cluster -w table
# 将领导权转移到另一个 etcd 集群成员,在leader节点上执行
etcdctl endpoint status --cluster -w table
etcdctl move-leader 225e005000422db3
# 保存数据库快照(数据库备份),只在其中一个节点执行
etcdctl snapshot save etcd_bak.db
# 查看快照信息
etcdctl snapshot status etcd_bak.db -w table
# 添加新成员
etcdctl member add etcd_pgtest04 http://10.18.1.4:2380
# 删除成员
etcdctl member remove caef4208a95efee8.
Patroni
# 卸载 python3
sudo yum remove python3 python3-devel
sudo yum history list python3-devel
sudo yum history undo 5
# 离线下载 python3
sudo yum install -y python3 python3-devel --downloadonly --downloaddir=/root/pkg/python
# 替换默认 yum 源中的 python
sed -i "s:\<python\>:python2:g" /usr/bin/yum
sed -i "s:\<python\>:python2:g" /usr/libexec/urlgrabber-ext-down
sed -i "s:\<python\>:python2:g" /usr/share/createrepo/*.py
# 切换默认 python 软链接
ln -sf /usr/bin/python3 /usr/bin/python
# 离线下载 patroni
pip3 install patroni[etcd] -i https://mirrors.aliyun.com/pypi/simple/ --download /root/pkg/patroni-pip
pip3 install psycopg2-binary -i https://mirrors.aliyun.com/pypi/simple/ --download /root/pkg/patroni-pip
pip3 install psycopg2 -i https://mirrors.aliyun.com/pypi/simple/ --download /root/pkg/patroni-pip
# 打包
tar -zcvf patroni_etcd.tar.gz /root/pkg/patroni-pip
# 执行安装
pip3 install /data/pkg/pkg/patroni-pip/psutil-5.9.2.tar.gz
pip3 install /data/pkg/pkg/patroni-pip/ydiff-1.2.tar.gz
pip3 install /data/pkg/pkg/patroni-pip/*.whl
pip3 install /data/pkg/pkg/patroni-pip/python-etcd-0.4.5.tar.gz
pip3 install /data/pkg/pkg/patroni-pip/psycopg2-binary-2.9.3.tar.gz
# 如果报错 ./psycopg/psycopg.h:36:22: fatal error: libpq-fe.h: No such file or directory 执行如下安装
# sudo yum install -y libpq5 libpq5-devel --downloadonly --downloaddir=/root/pkg/python
pip3 install /data/pkg/pkg/patroni-pip/psycopg2-2.9.3.tar.gz
pip3 list
click (8.0.4)
dnspython (2.2.1)
importlib-metadata (4.8.3)
patroni (2.1.4)
pip (9.0.3)
prettytable (2.5.0)
psutil (5.9.2)
psycopg2 (2.9.3)
psycopg2-binary (2.9.3)
python-dateutil (2.8.2)
python-etcd (0.4.5)
PyYAML (6.0)
setuptools (39.2.0)
six (1.16.0)
typing-extensions (4.1.1)
urllib3 (1.26.12)
wcwidth (0.2.5)
ydiff (1.2)
zipp (3.6.0)
# 主节点创建 patroni.yml
cat << EOF > /data/app/patroni/patroni.yml
scope: pg_cluster
namespace: /pg
name: postgres-1
log:
level: INFO
traceback_level: ERROR
dir: /data/app/patroni
file_num: 20
file_size: 104857600
dateformat: '%Y-%m-%d %H:%M:%S %z'
format: '%(asctime)s %(levelname)s: %(message)s'
tags:
nofailover: false
noloadbalance: false
clonefrom: false
nosync: false
restapi:
listen: 10.18.1.1:8008
connect_address: 10.18.1.1:8008
etcd:
hosts:
- 10.18.1.1:2379
- 10.18.1.2:2379
- 10.18.1.3:2379
bootstrap:
method: initdb
custom-init:
command: "/usr/pgsql-13/bin/initdb --wal-segsize=64 --data-checksums --encoding=UTF8 --locale=C --allow-group-access --pgdata=/data/pgdata"
initdb:
- data-checksums
- encoding: UTF8
- locale: C
# - wal-segsize: 64MB
# - allow-group-access
- pgdata: /data/pgdata
users:
replica:
password: d64ea0ae96efaf336dd8b62ebecee930
options:
- createrole
- replication
postgres:
password: 6edef2d746f2274cab951a452d5fc13d
rewind:
password: 75091230ae7b655a9bd693eb202eb06a
options:
- createrole
pg_hba:
# TYPE DATABASE USER ADDRESS METHOD
- local all all peer
- host all all 127.0.0.1/32 md5
- host all all 10.18.0.0/16 md5
- local replication all peer
- host replication all 127.0.0.1/32 md5
- host replication replica 10.18.1.0/24 md5
dcs:
# the number of seconds the loop will sleep. Default value: 10
# this is patroni check loop interval
loop_wait: 10
# the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30
# config this according to your network condition to avoid false-positive failover
ttl: 30
# timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10
retry_timeout: 10
# the amount of time a master is allowed to recover from failures before failover is triggered (in seconds)
# Max RTO: 2 loop wait + master_start_timeout
master_start_timeout: 300
# The number of seconds Patroni is allowed to wait when stopping Postgres and effective only when synchronous_mode is enabled
master_stop_timeout: 30
# import: candidate will not be promoted if replication lag is higher than this
# maximum RPO: 1MB
maximum_lag_on_failover: 1048576
# turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election
# set to true for RPO mode
synchronous_mode: false
# prevents disabling synchronous replication if no synchronous replicas are available, blocking all client writes to the master
synchronous_mode_strict: false
postgresql:
use_pg_rewind: true
parameters:
# IMPORTANT PARAMETERS
max_connections: 500 # 100-800
superuser_reserved_connections: 10 # reserve 10 connection for su
max_locks_per_transaction: 128 # 64 -> 128
max_prepared_transactions: 0 # 0 disable 2PC
track_commit_timestamp: on # enabled xact timestamp
max_worker_processes: 32 # default 8 -> 64, set to cpu core 64
wal_level: replica # logical replica
wal_log_hints: on # wal log hints to support rewind
max_wal_senders: 24 # 10 -> 24
max_replication_slots: 16 # 10 -> 16
wal_keep_size: 100GB # keep at least 100GB WAL
password_encryption: md5 # use traditional md5 auth
# RESOURCE USAGE
shared_buffers: 16GB # 1/4 of total memory
maintenance_work_mem: 2GB
work_mem: 32MB # 4MB -> 32MB
huge_pages: try # try huge pages
vacuum_cost_delay: 5ms # wait 2ms per 10000 cost
vacuum_cost_limit: 10000 # 10000 cost each round
bgwriter_delay: 10ms # check dirty page every 10ms
bgwriter_lru_maxpages: 800 # 100 -> 800
bgwriter_lru_multiplier: 5.0 # 2.0 -> 5.0 more cushion buffer
max_parallel_workers: 32 # default 8 -> 32, limit by max_worker_processes
max_parallel_maintenance_workers: 8 # default 2 -> 8, limit by parallel worker
max_parallel_workers_per_gather: 0 # default 2 -> 0, disable parallel query in OLTP mode
# WAL
wal_buffers: 16MB # max to 16MB
wal_writer_delay: 20ms # wait period
wal_writer_flush_after: 1MB # max allowed data loss
min_wal_size: 100GB # at least 100GB WAL
max_wal_size: 400GB # at most 400GB WAL
commit_delay: 20 # 200ms -> 20ms, increase speed
commit_siblings: 10 # 5 -> 10
checkpoint_timeout: 30min # checkpoint 5min -> 1h
checkpoint_completion_target: 0.95 # 0.5 -> 0.95
archive_mode: on
archive_command: "cp %p /data/pgarchive/%f"
# REPLICATION
# synchronous_standby_names: ''
vacuum_defer_cleanup_age: 50000 # 0->50000 last 50000 xact changes will not be vacuumed
promote_trigger_file: promote.signal # default promote trigger file path
max_standby_archive_delay: 10min # max delay before canceling queries when reading WAL from archive;
max_standby_streaming_delay: 3min # max delay before canceling queries when reading streaming WAL;
wal_receiver_status_interval: 1s # send replies at least this often
hot_standby_feedback: on # send info from standby to prevent query conflicts
wal_receiver_timeout: 60s # time that receiver waits for
max_logical_replication_workers: 8 # 4 -> 8, 6 sync worker + 1~2 apply worker
max_sync_workers_per_subscription: 6 # 2 -> 6, 6 sync worker
# QUERY TUNING
random_page_cost: 4 # 4 for HDD, 1.1 for SSD
effective_cache_size: 48GB # max mem - shared buffer
default_statistics_target: 1000 # stat bucket 100 -> 1000
# REPORTING AND LOGGING
log_destination: csvlog # use standard csv log
logging_collector: on # enable csvlog
log_directory: log # default log dir: /pg/data/log
# log_filename: 'postgresql-%a.log' # weekly auto-recycle
log_filename: 'postgresql-%Y-%m-%d.log' # YYYY-MM-DD full log retention
log_checkpoints: on # log checkpoint info
log_lock_waits: on # log lock wait info
log_replication_commands: on # log replication info
log_statement: ddl # log ddl change
log_min_duration_statement: 100 # log slow query (>100ms)
# STATISTICS
track_io_timing: on # collect io statistics
track_functions: all # track all functions (none|pl|all)
track_activity_query_size: 8192 # max query length in pg_stat_activity
# AUTOVACUUM
log_autovacuum_min_duration: 1s # log autovacuum activity take more than 1s
autovacuum_max_workers: 3 # default autovacuum worker 3
autovacuum_naptime: 1min # default autovacuum naptime 1min
autovacuum_vacuum_scale_factor: 0.08 # fraction of table size before vacuum 20% -> 8%
autovacuum_analyze_scale_factor: 0.04 # fraction of table size before analyze 10% -> 4%
autovacuum_vacuum_cost_delay: -1 # default vacuum cost delay: same as vacuum_cost_delay
autovacuum_vacuum_cost_limit: -1 # default vacuum cost limit: same as vacuum_cost_limit
autovacuum_freeze_max_age: 100000000 # age > 1 billion triggers force vacuum
# CLIENT
deadlock_timeout: 50ms # 50ms for deadlock
idle_in_transaction_session_timeout: 30min # 10min timeout for idle in transaction
# CUSTOMIZED OPTIONS
# extensions
shared_preload_libraries: 'pg_stat_statements, auto_explain"'
# auto_explain
auto_explain.log_min_duration: 1s # auto explain query slower than 1s
auto_explain.log_analyze: true # explain analyze
auto_explain.log_verbose: true # explain verbose
auto_explain.log_timing: true # explain timing
auto_explain.log_nested_statements: true
# pg_stat_statements
pg_stat_statements.max: 10000 # 5000 -> 10000 queries
pg_stat_statements.track: all # track all statements (all|top|none)
pg_stat_statements.track_utility: off # do not track query other than CRUD
pg_stat_statements.track_planning: off # do not track planning metrics
hot_standby: "on"
restore_command: "cp /data/pgarchive/%f %p"
postgresql:
listen: 0.0.0.0:5432
connect_address: 10.18.1.17:5432
data_dir: /data/pgdata
pgpass: /var/lib/pgsql/.pgpass
pg_ctl_timeout: 60
# rewind policy: data checksum should be enabled before using rewind
use_pg_rewind: true
remove_data_directory_on_rewind_failure: true
remove_data_directory_on_diverged_timelines: false
use_unix_socket: true # default: /var/run/postgresql, /tmp
authentication:
replication:
username: replica
password: d64ea0ae96efaf336dd8b62ebecee930
superuser:
username: postgres
password: 6edef2d746f2274cab951a452d5fc13d
rewind:
username: rewind
password: 75091230ae7b655a9bd693eb202eb06a
EOF
# 配置参考 https://pigsty.cc/
# 修改目录权限
chown -R postgres:postgres /data/app/patroni
# 创建服务
cat << EOF > /usr/lib/systemd/system/patroni.service
[Unit]
Description=patroni
After=network.target remote-fs.target nss-lookup.target etcd.service
Requires=etcd.service
[Service]
Type=simple
User=postgres
Group=postgres
Environment="PGHOME=/usr/pgsql-13"
Environment="PGDATA=/data/pgdata"
Environment="PGPORT=5432"
Environment="LD_LIBRARY_PATH=/usr/pgsql-13/lib"
Environment="PATH=/usr/pgsql-13/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:\$PATH"
ExecStart=/usr/local/bin/patroni /data/app/patroni/patroni.yml
ExecReload=/bin/kill -s HUP \$MAINPID
ExecStop=/usr/bin/killall patroni
KillMode=process
TimeoutSec=30
Restart=no
[Install]
WantedBy=multi-user.target
EOF
# 启动服务
systemctl daemon-reload
systemctl start patroni.service
systemctl enable patroni.service
# 创建 role
su - postgres
psql -c "create role replica replication login password 'd64ea0ae96efaf336dd8b62ebecee930';"
psql -c "create role replica login password '75091230ae7b655a9bd693eb202eb06a';"
# patroni 常用操作
# 查看集群状态
patronictl -c /data/app/patroni/patroni.yml list
# 查看集群生效配置
patronictl -c /data/app/patroni/patroni.yml show-config
# 编辑集群配置
patronictl -c /data/app/patroni/patroni.yml edit-config
# 切换集群主节点
patronictl -c /data/app/patroni/patroni.yml switchover
# 查看集群主备切换历史
patronictl -c /data/app/patroni/patroni.yml history
HAProxy
# sudo yum install -y haproxy --downloadonly --downloaddir=/root/pkg/base
wget https://www.haproxy.org/download/2.4/src/haproxy-2.4.18.tar.gz
tar -xzvf haproxy-2.4.18.tar.gz -C /data/app
cd /data/app/haproxy-2.4.18
make TARGET=linux-glibc ARCH=x86_64 PREFIX=/data/app/haproxy-2.4.18 USE_PCRE=1 USE_OPENSSL=1 USE_ZLIB=1 USE_SYSTEMD=1 USE_CPU_AFFINITY=1
# 参数说明
# make help
# TARGET=linux-glibc # 内核版本
# ARCH=x86_64 # 指定CPU的架构为"x86_64"
# PREFIX=/data/app/haproxy # 指定haprpxy安装路径
# USE_PCRE=1 # 开启正则表达式。
# USE_OPENSSL=1 # 开启OPENSSL功能。
# USE_ZLIB=1 # 开启压缩和解压缩功能。
# USE_SYSTEMD=1 # 支持以"systemd"的方式启动。
# USE_CPU_AFFINITY=1 # 开启CPU的亲和性。
echo 'export PATH=/data/app/haproxy-2.4.18:$PATH' >> /etc/profile
source /etc/profile
# 主备节点添加 haproxy
cat << EOF > /data/app/haproxy-2.4.18/haproxy.cfg
global
log 127.0.0.1 local2
chroot /data/app/haproxy-2.4.18
maxconn 6000
user root
group root
daemon
nbproc 16
defaults
mode tcp
log 127.0.0.1 local2 err
option tcplog
option dontlognull
option redispatch
retries 3
maxconn 5000
timeout queue 1m
timeout connect 10s
timeout client 1m
timeout server 1m
timeout check 5s
listen status
bind *:1080
mode http
log global
stats enable
stats refresh 30s
stats uri /
stats realm Private lands
stats auth admin:hollysysAdmin1
listen master
bind *:35432
mode tcp
option tcplog
balance roundrobin
option httpchk OPTIONS /master
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server 1postgres-1 10.18.1.1:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
server 1postgres-2 10.18.1.2:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
server 1postgres-3 10.18.1.3:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
listen replicas
bind *:35433
mode tcp
option tcplog
balance roundrobin
option httpchk OPTIONS /replica
http-check expect status 200
default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server 1postgres-1 10.18.1.1:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
server 1postgres-2 10.18.1.2:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
server 1postgres-3 10.18.1.3:5432 maxconn 3000 check port 8008 inter 5000 rise 2 fall 2
EOF
cat << EOF > /usr/lib/systemd/system/haproxy.service
[Unit]
Description=HAProxy Load Balancer
After=network-online.target
Wants=network-online.target
[Service]
Environment="CONFIG=/data/app/haproxy-2.4.18/haproxy.cfg" "PIDFILE=/run/haproxy.pid" "EXTRAOPTS=-S /run/haproxy-master.sock"
ExecStartPre=/data/app/haproxy-2.4.18/haproxy -Ws -f \$CONFIG -c -q \$EXTRAOPTS
ExecStart=/data/app/haproxy-2.4.18/haproxy -Ws -f \$CONFIG -p \$PIDFILE \$EXTRAOPTS
ExecReload=/data/app/haproxy-2.4.18/haproxy -Ws -f \$CONFIG -c -q \$EXTRAOPTS
ExecReload=/bin/kill -USR2 \$MAINPID
KillMode=mixed
Restart=always
SuccessExitStatus=143
Type=notify
[Install]
WantedBy=multi-user.target
EOF
# 启动服务
systemctl daemon-reload
systemctl start haproxy
systemctl status haproxy
systemctl enable haproxy
Keepalived
# 下载离线包
wget https://www.keepalived.org/software/keepalived-2.2.4.tar.gz
tar -xzvf keepalived-2.2.4.tar.gz -C /data/app/sources
cd /data/app/sources/keepalived-2.2.4
# 编译
./configure --prefix=/data/app/keepalived-2.2.4
# 如果报错,则安装对应的包
# *** WARNING - this build will not support IPVS with IPv6. Please install libnl/libnl-3 dev libraries to support IPv6 with IPVS
sudo yum install -y libnl3 libnl3-devel --downloadonly --downloaddir=/root/pkg/base
# 安装
make && make install
# 配置环境变量
echo 'export PATH=/data/app/keepalived-2.2.4/sbin:$PATH' >> /etc/profile
source /etc/profile
keepalived -v
# keepalived 主节点配置文件
cp /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf.bak
cat << EOF > /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
router_id miot-postgres-1
script_user root
enable_script_security
}
vrrp_script chk_haproxy {
script "/usr/bin/killall -0 haproxy"
interval 2
weight 5
fall 3
rise 5
timeout 2
}
vrrp_instance VI_1 {
state MASTER
interface bond0
virtual_router_id 88
priority 100
advert_int 5
authentication {
auth_type PASS
auth_pass postgres
}
virtual_ipaddress {
10.18.1.240/24 dev bond0 label bond0:1
}
track_script {
chk_haproxy
}
}
EOF
# keepalived 备节点配置文件
cp /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf.bak
cat << EOF > /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf
! Configuration File for keepalived
global_defs {
router_id miot-postgres-2
script_user root
enable_script_security
}
vrrp_script chk_haproxy {
script "/usr/bin/killall -0 haproxy"
interval 2
weight 5
fall 3
rise 5
timeout 2
}
vrrp_instance VI_1 {
state BACKUP
interface bond0
virtual_router_id 88
priority 99
advert_int 5
authentication {
auth_type PASS
auth_pass postgres
}
virtual_ipaddress {
10.18.1.240/24 dev bond0 label bond0:1
}
track_script {
chk_haproxy
}
}
EOF
# keepalived.conf 中会使用到 killall 监听 haproxy 程序
sudo yum install -y psmisc --downloadonly --downloaddir=/root/pkg/base
# 替换服务
cat << EOF > /usr/lib/systemd/system/keepalived.service
[Unit]
Description=LVS and VRRP High Availability Monitor
After=network-online.target syslog.target
Wants=network-online.target
Documentation=man:keepalived(8)
Documentation=man:keepalived.conf(5)
Documentation=man:genhash(1)
Documentation=https://keepalived.org
[Service]
Type=notify
NotifyAccess=all
PIDFile=/run/keepalived.pid
KillMode=process
EnvironmentFile=-/data/app/keepalived-2.2.4/etc/sysconfig/keepalived
ExecStart=/data/app/keepalived-2.2.4/sbin/keepalived -f /data/app/keepalived-2.2.4/etc/keepalived/keepalived.conf \$KEEPALIVED_OPTIONS
ExecReload=/bin/kill -HUP \$MAINPID
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl start keepalived
systemctl enable keepalived
Prometheus
# 离线安装 prometheus
ls -l prometheus
total 67024
-rw-r--r-- 1 root root 12596988 9月 17 03:47 alertmanager-0.24.0-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 4891676 9月 17 03:47 blackbox_exporter-0.22.0-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 3422080 9月 17 03:47 haproxy_exporter-0.13.0-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 4199640 9月 17 03:47 node_exporter-1.3.1-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 3142508 9月 17 03:47 pgbouncer_exporter-0.4.1-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 3602872 9月 17 03:47 postgres_exporter-0.11.1-1.el7.x86_64.rpm
-rw-r--r-- 1 root root 36759220 9月 17 03:47 prometheus2-2.37.0-1.el7.x86_64.rpm
rpm -i --force pkg/prometheus/*.rpm
# 修改配置文件
mv /etc/prometheus /root/prometheus
# cp -r scripts/prometheus /etc/prometheus
# 修改 prometheus 数据目录,exporter 数据目录不用修改
cat /etc/default/prometheus
PROMETHEUS_OPTS='--config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus/data --web.console.libraries=/usr/share/prometheus/console_libraries --web.console.templates=/usr/share/prometheus/consoles'
sed -i 's/\/var\/lib\/prometheus\/data/\/data\/app\/prometheus\/prometheus/g' /etc/default/prometheus
# 添加 --storage.tsdb.retention.time=180d --storage.tsdb.retention.size=50GB
# 设置开机启动
systemctl enable --now prometheus alertmanager blackbox_exporter haproxy_exporter node_exporter pgbouncer_exporter postgres_exporter
Grafana
# 离线安装
rpm -ivh grafana/*.rpm
systemctl enable --now
调优
Device
- Cpupower
# set cpu power mode to performance mode if applicable
if (cpupower frequency-info --governors | grep -q "performance"); then
echo "cpupower performance governor is supported"
cpupower frequency-set --governor performance
else
echo "cpupower performance governor not available"
fi
- Numa
yum install -q -y grubby
grubby --update-kernel=/boot/vmlinuz-$(uname -r) --args="numa=off transparent_hugepage=never"
- Transparent-Hugepage
# disable transparent hugepage
if (! grep -q 'disable transparent hugepage' /etc/rc.local); then
echo 'never' >/sys/kernel/mm/transparent_hugepage/enabled
echo 'never' >/sys/kernel/mm/transparent_hugepage/defrag
cat >>/etc/rc.local <<-EOF
# disable transparent hugepage
echo 'never' > /sys/kernel/mm/transparent_hugepage/enabled
echo 'never' > /sys/kernel/mm/transparent_hugepage/defrag
EOF
chmod +x /etc/rc.d/rc.local
fi
- Disk Prefetch
if (! grep -q 'disk prefetch' /etc/rc.local); then
cat >>/etc/rc.local <<-EOF
# disk prefetch
blockdev --setra 16384 $(echo $(blkid | awk -F':' '$1!~"block"{print $1}'))
EOF
chmod +x /etc/rc.d/rc.local
fi
SWAP
swapoff -a
# 禁用 /etc/fstab 中 swap
Limit
cat >> /etc/security/limits.conf <<-EOF
* soft nproc 1048576
* hard nproc 1048576
* hard nofile 1048576
* soft nofile 1048576
* soft stack unlimited
* hard stack unlimited
* soft core unlimited
* hard core unlimited
* soft memlock 2500000
* hard memlock 2500000
root soft nproc 1048576
root hard nproc 1048576
root hard nofile 1048576
root soft nofile 1048576
root soft stack unlimited
root hard stack unlimited
root soft core unlimited
root hard core unlimited
root soft memlock 2500000
root hard memlock 2500000
EOF