
PostgreSQL中Old Master节点分析

发表于:2024-10-07 作者:千家信息网编辑
千家信息网最后更新 2024年10月07日
PostgreSQL中Old Master节点分析

在基于streaming replication搭建的PostgreSQL HA环境中,如因网络访问/硬件故障等原因导致Standby节点升级为Master节点,而Old Master节点的数据库并未损坏,那么在排除故障后,Old Master节点可以通过pg_rewind工具(而不需要通过重新备份的方式)成为New Master节点的Standby节点.


在PostgreSQL HA环境中,Standby节点升级为Master节点后,时间线会切换为新的时间线,比如从1变为2.而Old Master节点的时间线仍然为原来的时间线,比如仍为1,那么使用pg_rewind工具,Old Master节点如何从New Master节点读取相关的数据成为新的Standby节点?
1.确定New Master和Old Master数据一致性的Checkpoint位置.在该位置上,New Master和Old Master数据完全一致.这可以通过读取New Master节点的时间线历史文件获得,该文件位于$PGDATA/pg_wal/目录下,文件名称为XX.history
2.Old Master节点根据上一步获取的Checkpoint读取本机日志文件WAL Record,获取在此Checkpoint之后出现变化的Block,并以链表的方式存储Block编号等信息
3.根据第2步获取的Block信息从New Master节点拷贝相应的Block,替换Old Master节点相应的Block
4.拷贝New Master节点上除数据文件外的所有其他文件,包括配置文件等(如果拷贝数据文件,与备份方式搭建区别不大)
5.Old Master启动数据库,应用从Checkpoint开始后的WAL Record.

在执行主备切换后,New Master节点的时间线切换为n + 1,通过pg_rewind可使Old Master在分叉点开始与New Master同步,成为New Standby节点.


64bit的WAL Record寻址空间地址.

/*
 * Pointer to a location in the XLOG.  These pointers are 64 bits wide,
 * because we don't want them ever to overflow.
 */
typedef uint64 XLogRecPtr;


/*
 * Timeline identifier, stored as an unsigned 32-bit integer.  pg_rewind's
 * main() uses this type for the timeline of the last common checkpoint
 * (chkpttli) and for the timeline written to minRecoveryPointTLI (endtli).
 */
typedef uint32 TimeLineID;



intmain(int argc, char **argv){    static struct option long_options[] = {        {"help", no_argument, NULL, '?'},        {"target-pgdata", required_argument, NULL, 'D'},        {"source-pgdata", required_argument, NULL, 1},        {"source-server", required_argument, NULL, 2},        {"version", no_argument, NULL, 'V'},        {"dry-run", no_argument, NULL, 'n'},        {"no-sync", no_argument, NULL, 'N'},        {"progress", no_argument, NULL, 'P'},        {"debug", no_argument, NULL, 3},        {NULL, 0, NULL, 0}    };//命令选项    int         option_index;//选项编号    int         c;//字符ASCII码    XLogRecPtr  divergerec;//分支点    int         lastcommontliIndex;    XLogRecPtr  chkptrec;//checkpoint Record位置    TimeLineID  chkpttli;//时间线    XLogRecPtr  chkptredo;checkpoint REDO位置    size_t      size;    char       *buffer;//缓冲区    bool        rewind_needed;//是否需要rewind    XLogRecPtr  endrec;//结束点    TimeLineID  endtli;//结束时间线    ControlFileData ControlFile_new;//新的控制文件    set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind"));    progname = get_progname(argv[0]);    /* Process command-line arguments */    //处理命令行参数    if (argc > 1)    {        if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)        {            usage(progname);            exit(0);        }        if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)        {            puts("pg_rewind (PostgreSQL) " PG_VERSION);            exit(0);        }    }    while ((c = getopt_long(argc, argv, "D:nNP", long_options, &option_index)) != -1)    {        switch (c)        {            case '?':                fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);                exit(1);            case 'P':                showprogress = true;                break;            case 'n':                dry_run = true;                break;            case 'N':                do_sync = false;                break;            case 3:                debug = true;       
         break;            case 'D':           /* -D or --target-pgdata */                datadir_target = pg_strdup(optarg);                break;            case 1:             /* --source-pgdata */                datadir_source = pg_strdup(optarg);                break;            case 2:             /* --source-server */                connstr_source = pg_strdup(optarg);                break;        }    }    if (datadir_source == NULL && connstr_source == NULL)    {        fprintf(stderr, _("%s: no source specified (--source-pgdata or --source-server)\n"), progname);        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);        exit(1);    }    if (datadir_source != NULL && connstr_source != NULL)    {        fprintf(stderr, _("%s: only one of --source-pgdata or --source-server can be specified\n"), progname);        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);        exit(1);    }    if (datadir_target == NULL)    {        fprintf(stderr, _("%s: no target data directory specified (--target-pgdata)\n"), progname);        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);        exit(1);    }    if (optind < argc)    {        fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"),                progname, argv[optind]);        fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);        exit(1);    }    /*     * Don't allow pg_rewind to be run as root, to avoid overwriting the     * ownership of files in the data directory. We need only check for root     * -- any other user won't have sufficient permissions to modify files in     * the data directory.     * 不需要以root用户运行pg_rewind,避免覆盖数据目录中的文件owner.     * 只需要检查root用户,其他用户没有足够的权限更新数据目录中的文件.     
*/#ifndef WIN32    if (geteuid() == 0)    {        //root用户        fprintf(stderr, _("cannot be executed by \"root\"\n"));        fprintf(stderr, _("You must run %s as the PostgreSQL superuser.\n"),                progname);        exit(1);    }#endif    get_restricted_token(progname);    /* Set mask based on PGDATA permissions */    //根据PGDATA的权限设置权限mask    if (!GetDataDirectoryCreatePerm(datadir_target))    {        fprintf(stderr, _("%s: could not read permissions of directory \"%s\": %s\n"),                progname, datadir_target, strerror(errno));        exit(1);    }    umask(pg_mode_mask);    /* Connect to remote server */    //连接到远程服务器    if (connstr_source)        libpqConnect(connstr_source);    /*     * Ok, we have all the options and we're ready to start. Read in all the     * information we need from both clusters.     * 现在,我们有了相关的执行运行,准备开始运行.     * 从两个db clusters中读取所有需要的信息.     */    //读取目标控制文件    buffer = slurpFile(datadir_target, "global/pg_control", &size);    digestControlFile(&ControlFile_target, buffer, size);    pg_free(buffer);    //读取源控制文件    buffer = fetchFile("global/pg_control", &size);    digestControlFile(&ControlFile_source, buffer, size);    pg_free(buffer);    sanityChecks();    /*     * If both clusters are already on the same timeline, there's nothing to     * do.     * 如果两个clusters已经是同一个时间线,没有什么好做的了,报错.     */    if (ControlFile_target.checkPointCopy.ThisTimeLineID == ControlFile_source.checkPointCopy.ThisTimeLineID)    {        printf(_("source and target cluster are on the same timeline\n"));        rewind_needed = false;    }    else    {        //找到分叉点        findCommonAncestorTimeline(&divergerec, &lastcommontliIndex);        printf(_("servers diverged at WAL location %X/%X on timeline %u\n"),               (uint32) (divergerec >> 32), (uint32) divergerec,               targetHistory[lastcommontliIndex].tli);        /*         * Check for the possibility that the target is in fact a direct         * ancestor of the source. 
In that case, there is no divergent history         * in the target that needs rewinding.         * 检查目标是源的直接祖先的可能性.         * 在这种情况下,在需要调整的目标中就没有不同的历史.         */        if (ControlFile_target.checkPoint >= divergerec)        {            //如果目标的checkpoint > 分叉点,则需要rewind            rewind_needed = true;        }        else        {            //目标的checkpoint <= 分叉点            XLogRecPtr  chkptendrec;            /* Read the checkpoint record on the target to see where it ends. */            //读取目标的checkpoint记录,检查在哪结束?            chkptendrec = readOneRecord(datadir_target,                                        ControlFile_target.checkPoint,                                        targetNentries - 1);            /*             * If the histories diverged exactly at the end of the shutdown             * checkpoint record on the target, there are no WAL records in             * the target that don't belong in the source's history, and no             * rewind is needed.             * 如果正好在shutdown checkpoint Record处出现分叉,             *   那么在目标cluster中没有WAL Record属于源cluster历史,             *   不需要进行rewind操作,否则需要rewind.             */            if (chkptendrec == divergerec)                rewind_needed = false;            else                rewind_needed = true;        }    }    if (!rewind_needed)    {        //不需要rewind,退出        printf(_("no rewind required\n"));        exit(0);    }    //找到目标cluster最后的checkpoint点    findLastCheckpoint(datadir_target, divergerec,                       lastcommontliIndex,                       &chkptrec, &chkpttli, &chkptredo);    printf(_("rewinding from last common checkpoint at %X/%X on timeline %u\n"),           (uint32) (chkptrec >> 32), (uint32) chkptrec,           chkpttli);    /*     * Build the filemap, by comparing the source and target data directories.     
* 通过对比源和目标数据目录构建filemap     */    //创建filemap    filemap_create();    pg_log(PG_PROGRESS, "reading source file list\n");    fetchSourceFileList();    pg_log(PG_PROGRESS, "reading target file list\n");    traverse_datadir(datadir_target, &process_target_file);    /*     * Read the target WAL from last checkpoint before the point of fork, to     * extract all the pages that were modified on the target cluster after     * the fork. We can stop reading after reaching the final shutdown record.     * XXX: If we supported rewinding a server that was not shut down cleanly,     * we would need to replay until the end of WAL here.     * 从在分叉点之前的最后一个checkpoint开始读取目标WAL Record,     *   提取目标cluster上在分叉后所有被修改的pages.     * 在到达最后一个shutdown record时停止读取.     * XXX: 如果我们支持非正常关闭的数据库rewind,需要在这里重放WAL Record到WAL的末尾.     */    //构造filemap    pg_log(PG_PROGRESS, "reading WAL in target\n");    extractPageMap(datadir_target, chkptrec, lastcommontliIndex,                   ControlFile_target.checkPoint);    filemap_finalize();    if (showprogress)        calculate_totals();    /* this is too verbose even for verbose mode */    //如为debug模式,则打印filemap    if (debug)        print_filemap();    /*     * Ok, we're ready to start copying things over.     * 现在可以开始拷贝了.     */    if (showprogress)    {        pg_log(PG_PROGRESS, "need to copy %lu MB (total source directory size is %lu MB)\n",               (unsigned long) (filemap->fetch_size / (1024 * 1024)),               (unsigned long) (filemap->total_size / (1024 * 1024)));        fetch_size = filemap->fetch_size;        fetch_done = 0;    }    /*     * This is the point of no return. Once we start copying things, we have     * modified the target directory and there is no turning back!     * 到了这里,已无回头路可走了.     * 一旦开始拷贝,就必须更新目标路径,无法回头!     
*/    //    executeFileMap();    progress_report(true);    //创建backup_label文件并更新控制文件    pg_log(PG_PROGRESS, "\ncreating backup label and updating control file\n");    createBackupLabel(chkptredo, chkpttli, chkptrec);    /*     * Update control file of target. Make it ready to perform archive     * recovery when restarting.     * 更新目标控制文件.在重启时可执行归档恢复.     *     * minRecoveryPoint is set to the current WAL insert location in the     * source server. Like in an online backup, it's important that we recover     * all the WAL that was generated while we copied the files over.     * minRecoveryPoint设置为目标服务器上当前WAL插入的位置.     * 与在线backup类似,在拷贝和覆盖文件时根据所有生成的WAL日志进行恢复是很重要的.     */    //更新控制文件    memcpy(&ControlFile_new, &ControlFile_source, sizeof(ControlFileData));    if (connstr_source)    {        //获取源WAL插入的位置        endrec = libpqGetCurrentXlogInsertLocation();        //获取时间线        endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;    }    else    {        endrec = ControlFile_source.checkPoint;        endtli = ControlFile_source.checkPointCopy.ThisTimeLineID;    }    //更新控制文件    ControlFile_new.minRecoveryPoint = endrec;    ControlFile_new.minRecoveryPointTLI = endtli;    ControlFile_new.state = DB_IN_ARCHIVE_RECOVERY;    update_controlfile(datadir_target, progname, &ControlFile_new, do_sync);    pg_log(PG_PROGRESS, "syncing target data directory\n");    //同步数据目录(除数据文件之外)    syncTargetDirectory();    printf(_("Done!\n"));    return 0;}

