千家信息网

PostgreSQL 源码解读(136)- Buffer Manager#1(ReadBufferExtended函数)

发表于:2025-01-20 作者:千家信息网编辑
千家信息网最后更新 2025年01月20日,本节简单介绍了PostgreSQL缓存管理(Buffer Manager)中的其中一个实现函数ReadBufferExtended,该函数返回对应请求关系数据块的buffer.。一、数据结构Relat
千家信息网最后更新 2025年01月20日PostgreSQL 源码解读(136)- Buffer Manager#1(ReadBufferExtended函数)

本节简单介绍了PostgreSQL缓存管理(Buffer Manager)中的一个实现函数ReadBufferExtended,该函数返回对应请求关系数据块的buffer。

一、数据结构

Relation
关系的内存结构.

/* * Here are the contents of a relation cache entry. */typedef struct RelationData{    RelFileNode rd_node;        /* relation physical identifier */    /* use "struct" here to avoid needing to include smgr.h: */    struct SMgrRelationData *rd_smgr;   /* cached file handle, or NULL */    int         rd_refcnt;      /* reference count */    BackendId   rd_backend;     /* owning backend id, if temporary relation */    bool        rd_islocaltemp; /* rel is a temp rel of this session */    bool        rd_isnailed;    /* rel is nailed in cache */    bool        rd_isvalid;     /* relcache entry is valid */    char        rd_indexvalid;  /* state of rd_indexlist: 0 = not valid, 1 =                                 * valid, 2 = temporarily forced */    bool        rd_statvalid;   /* is rd_statlist valid? */    /*     * rd_createSubid is the ID of the highest subtransaction the rel has     * survived into; or zero if the rel was not created in the current top     * transaction.  This can be now be relied on, whereas previously it could     * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is     * the ID of the highest subtransaction the relfilenode change has     * survived into, or zero if not changed in the current transaction (or we     * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten     * when a relation has multiple new relfilenodes within a single     * transaction, with one of them occurring in a subsequently aborted     * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;     * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten     */    SubTransactionId rd_createSubid;    /* rel was created in current xact */    SubTransactionId rd_newRelfilenodeSubid;    /* new relfilenode assigned in                                                 * current xact */    Form_pg_class rd_rel;       /* RELATION tuple */    TupleDesc   rd_att;         /* tuple descriptor */    Oid         rd_id;          /* relation's object id */    LockInfoData rd_lockInfo;   /* lock mgr's info for locking relation */    RuleLock   *rd_rules;       /* rewrite rules */    MemoryContext rd_rulescxt;  /* private memory cxt for rd_rules, if any */    TriggerDesc *trigdesc;      /* Trigger info, or NULL if rel has none */    /* use "struct" here to avoid needing to include rowsecurity.h: */    struct RowSecurityDesc *rd_rsdesc;  /* row security policies, or NULL */    /* data managed by RelationGetFKeyList: */    List       *rd_fkeylist;    /* list of ForeignKeyCacheInfo (see below) */    bool        rd_fkeyvalid;   /* true if list has been computed */    MemoryContext rd_partkeycxt;    /* private memory cxt for the below */    struct PartitionKeyData *rd_partkey;    /* partition key, or NULL */    MemoryContext rd_pdcxt;     /* private context for partdesc */    struct PartitionDescData *rd_partdesc;  /* partitions, or NULL */    List       *rd_partcheck;   /* partition CHECK quals */    /* data managed by RelationGetIndexList: */    List       *rd_indexlist;   /* list of OIDs of indexes on relation */    Oid         rd_oidindex;    /* OID of unique index on OID, if any */    Oid         rd_pkindex;     /* OID of primary key, if any */    Oid         rd_replidindex; /* OID of replica identity index, if any */    /* data managed by RelationGetStatExtList: */    List       *rd_statlist;    /* list of OIDs of extended stats */    /* data managed by RelationGetIndexAttrBitmap: */    Bitmapset  *rd_indexattr;   /* columns 
used in non-projection indexes */    Bitmapset  *rd_projindexattr;   /* columns used in projection indexes */    Bitmapset  *rd_keyattr;     /* cols that can be ref'd by foreign keys */    Bitmapset  *rd_pkattr;      /* cols included in primary key */    Bitmapset  *rd_idattr;      /* included in replica identity index */    Bitmapset  *rd_projidx;     /* Oids of projection indexes */    PublicationActions *rd_pubactions;  /* publication actions */    /*     * rd_options is set whenever rd_rel is loaded into the relcache entry.     * Note that you can NOT look into rd_rel for this data.  NULL means "use     * defaults".     */    bytea      *rd_options;     /* parsed pg_class.reloptions */    /* These are non-NULL only for an index relation: */    Form_pg_index rd_index;     /* pg_index tuple describing this index */    /* use "struct" here to avoid needing to include htup.h: */    struct HeapTupleData *rd_indextuple;    /* all of pg_index tuple */    /*     * index access support info (used only for an index relation)     *     * Note: only default support procs for each opclass are cached, namely     * those with lefttype and righttype equal to the opclass's opcintype. The     * arrays are indexed by support function number, which is a sufficient     * identifier given that restriction.     *     * Note: rd_amcache is available for index AMs to cache private data about     * an index.  This must be just a cache since it may get reset at any time     * (in particular, it will get reset by a relcache inval message for the     * index).  If used, it must point to a single memory chunk palloc'd in     * rd_indexcxt.  A relcache reset will include freeing that chunk and     * setting rd_amcache = NULL.     
*/    Oid         rd_amhandler;   /* OID of index AM's handler function */    MemoryContext rd_indexcxt;  /* private memory cxt for this stuff */    /* use "struct" here to avoid needing to include amapi.h: */    struct IndexAmRoutine *rd_amroutine;    /* index AM's API struct */    Oid        *rd_opfamily;    /* OIDs of op families for each index col */    Oid        *rd_opcintype;   /* OIDs of opclass declared input data types */    RegProcedure *rd_support;   /* OIDs of support procedures */    FmgrInfo   *rd_supportinfo; /* lookup info for support procedures */    int16      *rd_indoption;   /* per-column AM-specific flags */    List       *rd_indexprs;    /* index expression trees, if any */    List       *rd_indpred;     /* index predicate tree, if any */    Oid        *rd_exclops;     /* OIDs of exclusion operators, if any */    Oid        *rd_exclprocs;   /* OIDs of exclusion ops' procs, if any */    uint16     *rd_exclstrats;  /* exclusion ops' strategy numbers, if any */    void       *rd_amcache;     /* available for use by index AM */    Oid        *rd_indcollation;    /* OIDs of index collations */    /*     * foreign-table support     *     * rd_fdwroutine must point to a single memory chunk palloc'd in     * CacheMemoryContext.  It will be freed and reset to NULL on a relcache     * reset.     */    /* use "struct" here to avoid needing to include fdwapi.h: */    struct FdwRoutine *rd_fdwroutine;   /* cached function pointers, or NULL */    /*     * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new     * version of a table, we need to make any toast pointers inserted into it     * have the existing toast table's OID, not the OID of the transient toast     * table.  If rd_toastoid isn't InvalidOid, it is the OID to place in     * toast pointers inserted into this rel.  (Note it's set on the new     * version of the main heap, not the toast table itself.)  This also     * causes toast_save_datum() to try to preserve toast value OIDs.     
*/    Oid         rd_toastoid;    /* Real TOAST table's OID, or InvalidOid */    /* use "struct" here to avoid needing to include pgstat.h: */    struct PgStat_TableStatus *pgstat_info; /* statistics collection area */} RelationData;typedef struct RelationData *Relation;

BufferAccessStrategy
buffer访问策略

/* * Buffer identifiers. * Buffer标识符 *  * Zero is invalid, positive is the index of a shared buffer (1..NBuffers), * negative is the index of a local buffer (-1 .. -NLocBuffer). * 0表示无效,正整数表示共享buffer的索引(1..N), *   负数是本地buffer的索引(-1..-N) */typedef int Buffer;#define InvalidBuffer   0/* * Buffer access strategy objects. * Buffer访问策略对象 * * BufferAccessStrategyData is private to freelist.c * BufferAccessStrategyData对freelist.c来说是私有的 */typedef struct BufferAccessStrategyData *BufferAccessStrategy; /* * Private (non-shared) state for managing a ring of shared buffers to re-use. * This is currently the only kind of BufferAccessStrategy object, but someday * we might have more kinds. * 私有状态,用于管理可重用的环形缓冲区. * 目前只有这么一种缓冲区访问策略对象,但未来某一天可以拥有更多. */typedef struct BufferAccessStrategyData{    /* Overall strategy type */    //全局的策略类型    BufferAccessStrategyType btype;    /* Number of elements in buffers[] array */    //buffers[]中的元素个数    int         ring_size;    /*     * Index of the "current" slot in the ring, ie, the one most recently     * returned by GetBufferFromRing.     * 环形缓冲区中当前slot的索引,最近访问的通过函数GetBufferFromRing返回.     */    int         current;    /*     * True if the buffer just returned by StrategyGetBuffer had been in the     * ring already.     * 如正好通过StrategyGetBuffer返回的buffer已在环形缓冲区中,则返回T     */    bool        current_was_in_ring;    /*     * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we     * have not yet selected a buffer for this ring slot.  For allocation     * simplicity this is palloc'd together with the fixed fields of the     * struct.     * buffer编号数组.     * InvalidBuffer(即:0)表示我们还没有为该slot选择buffer.     
* 为了分配的简单性,这是palloc'd与结构的固定字段。     */    Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];}           BufferAccessStrategyData;//Block结构体指针typedef void *Block;/* Possible arguments for GetAccessStrategy() *///GetAccessStrategy()函数可取值的参数typedef enum BufferAccessStrategyType{    //常规的随机访问    BAS_NORMAL,                 /* Normal random access */    //大规模的只读扫描    BAS_BULKREAD,               /* Large read-only scan (hint bit updates are                                 * ok) */    //大量的多块写(如 COPY IN)    BAS_BULKWRITE,              /* Large multi-block write (e.g. COPY IN) */    //VACUUM    BAS_VACUUM                  /* VACUUM */} BufferAccessStrategyType;

ReadBufferMode
ReadBufferExtended函数所可能使用的读取模式.

/*
 * Read modes accepted by ReadBufferExtended().
 *
 * RBM_NORMAL
 *      The page is read from disk and its page header is validated; an
 *      invalid header raises an error.  Note that an all-zero page counts
 *      as "valid" (see PageIsVerified()).
 *
 * RBM_ZERO_ON_ERROR
 *      Same as the normal mode, except that a page with an invalid header
 *      is zeroed instead of raising an error.  Intended for non-critical
 *      data where the caller is prepared to repair errors.
 *
 * RBM_ZERO_AND_LOCK
 *      If the page is not already in the buffer cache it is filled with
 *      zeros rather than read from disk.  Useful when the caller will fill
 *      the page from scratch: it saves I/O and avoids needless failures
 *      when the on-disk page has a corrupt header.  The page is returned
 *      locked so the caller gets a chance to initialize it before it
 *      becomes visible to others.
 *      Caution: do not use this mode for a page beyond the relation's
 *      current physical EOF; that is likely to cause problems in md.c when
 *      the page is modified and written out.  P_NEW is OK, though.
 *
 * RBM_ZERO_AND_CLEANUP_LOCK
 *      Same as RBM_ZERO_AND_LOCK, but a cleanup-strength lock is acquired
 *      on the page.
 *
 * RBM_NORMAL_NO_LOG
 *      Treated the same as RBM_NORMAL here.
 */
typedef enum
{
    /* ordinary read */
    RBM_NORMAL,

    /* no disk read, the caller initializes the page; the page is locked */
    RBM_ZERO_AND_LOCK,

    /* as RBM_ZERO_AND_LOCK, but the page is locked in "cleanup" mode */
    RBM_ZERO_AND_CLEANUP_LOCK,

    /* read, but hand back an all-zeros page on error */
    RBM_ZERO_ON_ERROR,

    /*
     * do not log the page as invalid during WAL replay; otherwise identical
     * to RBM_NORMAL
     */
    RBM_NORMAL_NO_LOG
} ReadBufferMode;

二、源码解读

ReadBufferExtended返回对应请求关系数据块的buffer,实现逻辑比较简单,详见代码.
主要的实现逻辑在ReadBuffer_common中,该函数后续再行介绍.

/* * ReadBufferExtended -- returns a buffer containing the requested *      block of the requested relation.  If the blknum *      requested is P_NEW, extend the relation file and *      allocate a new block.  (Caller is responsible for *      ensuring that only one backend tries to extend a *      relation at the same time!) * ReadBufferExtended -- 返回对应请求关系数据块的buffer. *      如果blknum是P_NEW,则扩展关系文件并分配新块. *      (调用者有责任确保只有一个后台进程在同一时刻尝试扩展关系) * * Returns: the buffer number for the buffer containing *      the block read.  The returned buffer has been pinned. *      Does not return on error --- elog's instead. * 返回:对应block的buffer编号.返回的buffer已被pinned.不需要返回错误,因为elog已进行处理. * * Assume when this function is called, that reln has been opened already. * 假定调用该函数时,关系reln已被打开. * * In RBM_NORMAL mode, the page is read from disk, and the page header is * validated.  An error is thrown if the page header is not valid.  (But * note that an all-zero page is considered "valid"; see PageIsVerified().) * 在RBM_NORMAL模式,page从磁盘中读取,page头部已被验证有效.假如page头部是无效的,那会抛出错误. * (但是需要注意,初始化的page被认为是有效的;详细参见PageIsVerified函数) * * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not * valid, the page is zeroed instead of throwing an error. This is intended * for non-critical data, where the caller is prepared to repair errors. * RBM_ZERO_ON_ERROR类似于正常模式,但如果page header是无效的,则初始化page(置0),而不是报错. * 在调用者准备修复错误时,针对非关键数据使用. * * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's * filled with zeros instead of reading it from disk.  Useful when the caller * is going to fill the page from scratch, since this saves I/O and avoids * unnecessary failure if the page-on-disk has corrupt page headers. * The page is returned locked to ensure that the caller has a chance to * initialize the page before it's made visible to others. 
* Caution: do not use this mode to read a page that is beyond the relation's * current physical EOF; that is likely to cause problems in md.c when * the page is modified and written out. P_NEW is OK, though. * 在RBM_ZERO_AND_LOCK模式,如果page还没有处于buffer cache,填充0而不是从磁盘中读取. * 在调用者从scratch填充page时使用,因为这样可以节省I/O并避免不必要的page-on-disk的header错误. * page会被锁定并返回,确保在page可见前由调用者初始化此page. * 特别注意:不要在在关系文件标记位(EOF)后使用这种模式读取page,这会在md.c中,修改page并写出该page后出现问题. * 但是,P_NEW是可以的. * * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires * a cleanup-strength lock on the page. * RBM_ZERO_AND_CLEANUP_LOCK模式与RBM_ZERO_AND_LOCK模式类似,但在page上请求cleanup-strength lock. * * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. * RBM_NORMAL_NO_LOG模式与RBM_NORMAL一致. * * If strategy is not NULL, a nondefault buffer access strategy is used. * See buffer/README for details. * 如strategy非空,则使用非默认的buffer访问策略.详细参见buffer/README. */BufferReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,                   ReadBufferMode mode, BufferAccessStrategy strategy){    bool        hit;    Buffer      buf;    /* Open it at the smgr level if not already done */    //打开relation,级别为smgr    RelationOpenSmgr(reln);    /*     * Reject attempts to read non-local temporary relations; we would be     * likely to get wrong data since we have no visibility into the owning     * session's local buffers.     * 拒绝尝试访问非本地临时relations.     * 由于没有自己会话的本地缓存可见信息,因此读取临时表会得到错误的数据.     */    if (RELATION_IS_OTHER_TEMP(reln))        ereport(ERROR,                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),                 errmsg("cannot access temporary tables of other sessions")));    /*     * Read the buffer, and update pgstat counters to reflect a cache hit or     * miss.     * 调用ReadBuffer_common读取buffer,更新pgstat计数器以反映命中还是缺失.     
*/    pgstat_count_buffer_read(reln);    buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,                            forkNum, blockNum, mode, strategy, &hit);    if (hit)        pgstat_count_buffer_hit(reln);    return buf;}

三、跟踪分析

使用bt查看调用栈

(gdb) bt
#0  ReadBufferExtended (reln=0x7f497fe72788, forkNum=MAIN_FORKNUM, blockNum=0, mode=RBM_NORMAL, strategy=0x0)
    at bufmgr.c:647
#1  0x00000000004d974f in heapgetpage (scan=0x1d969d8, page=0) at heapam.c:379
#2  0x00000000004daeb2 in heapgettup_pagemode (scan=0x1d969d8, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:837
#3  0x00000000004dcf2b in heap_getnext (scan=0x1d969d8, direction=ForwardScanDirection) at heapam.c:1842
#4  0x000000000070ec39 in SeqNext (node=0x1d95890) at nodeSeqscan.c:80
#5  0x00000000006e0ab0 in ExecScanFetch (node=0x1d95890, accessMtd=0x70eba9 <SeqNext>, recheckMtd=0x70ec74 <SeqRecheck>)
    at execScan.c:95
#6  0x00000000006e0b22 in ExecScan (node=0x1d95890, accessMtd=0x70eba9 <SeqNext>, recheckMtd=0x70ec74 <SeqRecheck>)
    at execScan.c:145
#7  0x000000000070ecbe in ExecSeqScan (pstate=0x1d95890) at nodeSeqscan.c:129
#8  0x00000000006dee2a in ExecProcNodeFirst (node=0x1d95890) at execProcnode.c:445
#9  0x00000000007021b8 in ExecProcNode (node=0x1d95890) at ../../../src/include/executor/executor.h:237
#10 0x00000000007022dd in ExecLimit (pstate=0x1d95680) at nodeLimit.c:95
#11 0x00000000006dee2a in ExecProcNodeFirst (node=0x1d95680) at execProcnode.c:445
#12 0x00000000006d3d8d in ExecProcNode (node=0x1d95680) at ../../../src/include/executor/executor.h:237
#13 0x00000000006d65c5 in ExecutePlan (estate=0x1d95468, planstate=0x1d95680, use_parallel_mode=false,
    operation=CMD_SELECT, sendTuples=true, numberTuples=0, direction=ForwardScanDirection, dest=0x1d00ea8,
    execute_once=true) at execMain.c:1723
#14 0x00000000006d4357 in standard_ExecutorRun (queryDesc=0x1cfdc28, direction=ForwardScanDirection, count=0,
    execute_once=true) at execMain.c:364
#15 0x00000000006d417f in ExecutorRun (queryDesc=0x1cfdc28, direction=ForwardScanDirection, count=0, execute_once=true)
    at execMain.c:307
#16 0x00000000008bffd4 in PortalRunSelect (portal=0x1d3ebf8, forward=true, count=0, dest=0x1d00ea8) at pquery.c:932
#17 0x00000000008bfc72 in PortalRun (portal=0x1d3ebf8, count=9223372036854775807, isTopLevel=true, run_once=true,
    dest=0x1d00ea8, altdest=0x1d00ea8, completionTag=0x7ffc1fc513d0 "") at pquery.c:773
#18 0x00000000008b9cd4 in exec_simple_query (query_string=0x1cd8ec8 "select * from t1 limit 10;") at postgres.c:1145
---Type <return> to continue, or q <return> to quit---
#19 0x00000000008bdf5f in PostgresMain (argc=1, argv=0x1d05278, dbname=0x1d050e0 "testdb", username=0x1cd5ba8 "xdb")
    at postgres.c:4182
#20 0x000000000081c16d in BackendRun (port=0x1cfae00) at postmaster.c:4361
#21 0x000000000081b8e0 in BackendStartup (port=0x1cfae00) at postmaster.c:4033
#22 0x0000000000817cda in ServerLoop () at postmaster.c:1706
#23 0x0000000000817590 in PostmasterMain (argc=1, argv=0x1cd3b60) at postmaster.c:1379
#24 0x0000000000741003 in main (argc=1, argv=0x1cd3b60) at main.c:228
(gdb)

逻辑较为简单,这里不再详细跟踪.

四、参考资料

PG Source Code

0