PostgreSQL同步复制主库挂起分析
发表于:2024-10-28 作者:千家信息网编辑
千家信息网最后更新 2024年10月28日,这篇文章主要讲解了"PostgreSQL同步复制主库挂起分析",文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习"PostgreSQL同步复制主库挂起分析"吧
千家信息网最后更新 2024年10月28日PostgreSQL同步复制主库挂起分析
这篇文章主要讲解了"PostgreSQL同步复制主库挂起分析",文中的讲解内容简单清晰,易于学习与理解,下面请大家跟着小编的思路慢慢深入,一起来研究和学习"PostgreSQL同步复制主库挂起分析"吧!
在Streaming Replication环境中PostgreSQL主节点设置为同步复制,如standby节点没有启动或者网络出现问题没法连接到主节点时,主节点如执行DML则进程会挂起,下面分析这个挂起的问题.
一、数据结构
Latch
Latch结构体应被视为"不透明的"(opaque),并且只能通过公共函数访问。之所以在这里给出定义,是为了允许把Latch嵌入到更大的结构体中,作为其一部分。
//通常情况下,int类型的变量通常是原子访问的,也可以认为 sig_atomic_t就是int类型的数据,//因为对这些变量要求一条指令完成,所以sig_atomic_t不可能是结构体,只会是数字类型。typedef int __sig_atomic_t;/* * Latch structure should be treated as opaque and only accessed through * the public functions. It is defined here to allow embedding Latches as * part of bigger structs. * Latch结构体应被视为"不透明的"opaque,并且只能通过公共的函数访问. * 在这里定义是运行把Latchs作为更大的结构体的一部分. */typedef struct Latch{ sig_atomic_t is_set; bool is_shared; int owner_pid;#ifdef WIN32 HANDLE event;#endif} Latch;
二、源码解读
N/A
三、跟踪分析
启动master节点,不启动standby节点,使用psql连接数据库,执行SQL,Session挂起:
testdb=# drop table t1;
使用gdb跟踪挂起的进程
[xdb@localhost ~]$ ps -ef|grep postgresxdb 1318 1 0 12:14 pts/0 00:00:00 /appdb/xdb/pg11.2/bin/postgresxdb 1319 1318 0 12:14 ? 00:00:00 postgres: logger xdb 1321 1318 0 12:14 ? 00:00:00 postgres: checkpointer xdb 1322 1318 0 12:14 ? 00:00:00 postgres: background writer xdb 1323 1318 0 12:14 ? 00:00:00 postgres: walwriter xdb 1324 1318 0 12:14 ? 00:00:00 postgres: autovacuum launcher xdb 1325 1318 0 12:14 ? 00:00:00 postgres: archiver xdb 1326 1318 0 12:14 ? 00:00:00 postgres: stats collector xdb 1327 1318 0 12:14 ? 00:00:00 postgres: logical replication launcher xdb 1331 1318 0 12:15 ? 00:00:00 postgres: xdb testdb [local] DROP TABLE waiting for 0/5D07B668[xdb@localhost ~]$ gdb -p 1331GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7...
查看调用栈
(gdb) bt#0 0x00007f4636d48903 in __epoll_wait_nocancel () from /lib64/libc.so.6#1 0x000000000088e668 in WaitEventSetWaitBlock (set=0x21640e8, cur_timeout=-1, occurred_events=0x7ffc96572f40, nevents=1) at latch.c:1048#2 0x000000000088e543 in WaitEventSetWait (set=0x21640e8, timeout=-1, occurred_events=0x7ffc96572f40, nevents=1, wait_event_info=134217761) at latch.c:1000#3 0x000000000088dcec in WaitLatchOrSocket (latch=0x7f462d5b44d4, wakeEvents=17, sock=-1, timeout=-1, wait_event_info=134217761) at latch.c:385#4 0x000000000088dbcd in WaitLatch (latch=0x7f462d5b44d4, wakeEvents=17, timeout=-1, wait_event_info=134217761) at latch.c:339#5 0x0000000000863e2d in SyncRepWaitForLSN (lsn=1560786536, commit=true) at syncrep.c:286#6 0x0000000000546279 in RecordTransactionCommit () at xact.c:1359#7 0x0000000000546da3 in CommitTransaction () at xact.c:2074#8 0x0000000000547a3f in CommitTransactionCommand () at xact.c:2817#9 0x00000000008be250 in finish_xact_command () at postgres.c:2523#10 0x00000000008bbf45 in exec_simple_query (query_string=0x20a1d78 "drop table t1;") at postgres.c:1170#11 0x00000000008c0191 in PostgresMain (argc=1, argv=0x20cdcd8, dbname=0x20cdb40 "testdb", username=0x209ea98 "xdb") at postgres.c:4182#12 0x000000000081e06c in BackendRun (port=0x20c3b10) at postmaster.c:4361#13 0x000000000081d7df in BackendStartup (port=0x20c3b10) at postmaster.c:4033#14 0x0000000000819bd9 in ServerLoop () at postmaster.c:1706#15 0x000000000081948f in PostmasterMain (argc=1, argv=0x209ca50) at postmaster.c:1379#16 0x0000000000742931 in main (argc=1, argv=0x209ca50) at main.c:228(gdb)
kill掉挂起的进程,重新连接数据库,用gdb附加到新的后端进程,并在WaitLatch上设置断点进行跟踪
#########[xdb@localhost ~]$ kill -9 1331#########testdb=# select pg_backend_pid(); pg_backend_pid ---------------- 1377(1 row)#########[xdb@localhost ~]$ gdb -p 1377...(gdb) b WaitLatchBreakpoint 1 at 0x88dbac: file latch.c, line 339.(gdb) #########testdb=# drop table t1;ERROR: table "t1" does not existtestdb=# create table t1(id int);
进入断点
(gdb) b WaitLatchBreakpoint 1 at 0x88dbac: file latch.c, line 339.(gdb) cContinuing.Breakpoint 1, WaitLatch (latch=0x7f462d5b44d4, wakeEvents=17, timeout=-1, wait_event_info=134217761) at latch.c:339339 return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,(gdb)
进入WaitLatchOrSocket
(gdb) stepWaitLatchOrSocket (latch=0x7f462d5b44d4, wakeEvents=17, sock=-1, timeout=-1, wait_event_info=134217761) at latch.c:359359 int ret = 0;(gdb) (gdb) p *latch$1 = {is_set = 0, is_shared = true, owner_pid = 1377}
构建等待事件集
(gdb) n362 WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);(gdb) n364 if (wakeEvents & WL_TIMEOUT)(gdb) 367 timeout = -1;(gdb) 369 if (wakeEvents & WL_LATCH_SET)(gdb) p *set$2 = {nevents = 0, nevents_space = 3, events = 0x2181eb8, latch = 0x0, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}(gdb) p *set->events$3 = {pos = 0, events = 0, fd = 0, user_data = 0x0}(gdb) p *set->epoll_ret_events$4 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}(gdb) $5 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}(gdb) n370 AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,(gdb) 373 if (wakeEvents & WL_POSTMASTER_DEATH && IsUnderPostmaster)(gdb) 374 AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,(gdb) 377 if (wakeEvents & WL_SOCKET_MASK)(gdb) 385 rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);(gdb) p *set$6 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}(gdb) p *set->events$7 = {pos = 0, events = 1, fd = 11, user_data = 0x0}(gdb) p *set->epoll_ret_events$8 = {events = 0, data = {ptr = 0x0, fd = 0, u32 = 0, u64 = 0}}(gdb)
进入WaitEventSetWait
(gdb) stepWaitEventSetWait (set=0x2181e90, timeout=-1, occurred_events=0x7ffc96572f40, nevents=1, wait_event_info=134217761) at latch.c:925925 int returned_events = 0;(gdb)
输入参数
(gdb) n928 long cur_timeout = -1;(gdb) p *set$9 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}(gdb) p *occurred_events$10 = {pos = 35135068, events = 0, fd = -1772664741, user_data = 0x7ffc96572fa0}(gdb)
执行相关判断和设置参数
(gdb) n930 Assert(nevents > 0);(gdb) 936 if (timeout >= 0)(gdb) 943 pgstat_report_wait_start(wait_event_info);(gdb) 946 waiting = true;(gdb)
未有事件出现,则循环
951 while (returned_events == 0)(gdb)
set->latch->is_set为0,不满足latch已被设置(is_set为true)的条件,因此继续往下执行,进入等待
982 if (set->latch && set->latch->is_set)(gdb) p *set->latch$11 = {is_set = 0, is_shared = true, owner_pid = 1377}(gdb)
进入WaitEventSetWaitBlock
(gdb) n1000 rc = WaitEventSetWaitBlock(set, cur_timeout,(gdb) stepWaitEventSetWaitBlock (set=0x2181e90, cur_timeout=-1, occurred_events=0x7ffc96572f40, nevents=1) at latch.c:10421042 int returned_events = 0;(gdb)
调用epoll_wait,挂起
(gdb) n1048 rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,(gdb) p *set$12 = {nevents = 2, nevents_space = 3, events = 0x2181eb8, latch = 0x7f462d5b44d4, latch_pos = 0, epoll_fd = 37, epoll_ret_events = 0x2181f00}(gdb) (gdb) n
启动standby节点
####[xdb@localhost ~]$ pg_ctl startpg_ctl: another server might be running; trying to start server anyway...
接收到信号
Program received signal SIGUSR1, User defined signal 1.0x00007f4636d48903 in __epoll_wait_nocancel () from /lib64/libc.so.6(gdb) (gdb) nSingle stepping until exit from function __epoll_wait_nocancel,which has no line number information.procsignal_sigusr1_handler (postgres_signal_arg=-1) at procsignal.c:262262 {(gdb)
感谢各位的阅读,以上就是"PostgreSQL同步复制主库挂起分析"的内容了,经过本文的学习后,相信大家对PostgreSQL同步复制主库挂起分析这一问题有了更深刻的体会,具体使用情况还需要大家实践验证。这里是千家信息网,小编将为大家推送更多相关知识点的文章,欢迎关注!
节点
分析
结构
同步
数据
类型
进程
问题
学习
跟踪
事件
内容
函数
参数
变量
就是
情况
断点
循环
运行
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
网络安全中网络拓扑
青岛直播软件开发公司有哪些
360安全桌面移动软件开发
计算机网络技术专业四级
建立保险数据库
最常用英文单词数据库
高级软件开发工程师 证书
酒店管理系统数据库实施
科东网络安全平台登录
山西电商软件开发费用
员工统计数据库
服务器当路由器
立体化软件开发
广州一微互联网科技有限公司
电脑软件开发可以在家工作吗
服务器上的文件夹怎么找回
符合免征增值税的软件开发
数据库导入和到处是什么意思
抖音直播伴侣软件开发
服务器集群的安全防御
shell读取文本更新数据库
查询数据库第二条数据
广州资深软件开发待遇
中信科技 工业互联网
三星ai服务器
linux服务器安全管理
华为数据库必须有主键么
世界上的12个服务器
服务器vdo
服务器安全备份