导航：首页 > 数据库 >

PostgreSQL 源码解读（2）- 插入数据#2（RelationPutHeapTuple）

发表于：2025-01-31 作者：千家信息网编辑

千家信息网最后更新 2025年01月31日。本文简单介绍了PG插入数据部分的源码，主要内容包括RelationPutHeapTuple函数的实现逻辑。一、数据结构/宏定义/通用函数：RelationPutHeapTuple函数在hio.c文件中……

千家信息网最后更新 2025年01月31日：PostgreSQL 源码解读（2）- 插入数据#2（RelationPutHeapTuple）

本文简单介绍了PG插入数据部分的源码，主要内容包括RelationPutHeapTuple函数的实现逻辑。

一、数据结构/宏定义/通用函数

RelationPutHeapTuple函数在hio.c文件中，相关的数据结构、宏定义如下：

1、Relation数据表数据结构封装 typedef struct RelationData {     RelFileNode rd_node;        /* relation physical identifier */     /* use "struct" here to avoid needing to include smgr.h: */     struct SMgrRelationData *rd_smgr;   /* cached file handle, or NULL */     int         rd_refcnt;      /* reference count */     BackendId   rd_backend;     /* owning backend id, if temporary relation */     bool        rd_islocaltemp; /* rel is a temp rel of this session */     bool        rd_isnailed;    /* rel is nailed in cache */     bool        rd_isvalid;     /* relcache entry is valid */     char        rd_indexvalid;  /* state of rd_indexlist: 0 = not valid, 1 =                                  * valid, 2 = temporarily forced */     bool        rd_statvalid;   /* is rd_statlist valid? */      /*      * rd_createSubid is the ID of the highest subtransaction the rel has      * survived into; or zero if the rel was not created in the current top      * transaction.  This can be now be relied on, whereas previously it could      * be "forgotten" in earlier releases. Likewise, rd_newRelfilenodeSubid is      * the ID of the highest subtransaction the relfilenode change has      * survived into, or zero if not changed in the current transaction (or we      * have forgotten changing it). rd_newRelfilenodeSubid can be forgotten      * when a relation has multiple new relfilenodes within a single      * transaction, with one of them occurring in a subsequently aborted      * subtransaction, e.g. 
BEGIN; TRUNCATE t; SAVEPOINT save; TRUNCATE t;      * ROLLBACK TO save; -- rd_newRelfilenode is now forgotten      */     SubTransactionId rd_createSubid;    /* rel was created in current xact */     SubTransactionId rd_newRelfilenodeSubid;    /* new relfilenode assigned in                                                  * current xact */      Form_pg_class rd_rel;       /* RELATION tuple */     TupleDesc   rd_att;         /* tuple descriptor */     Oid         rd_id;          /* relation's object id */     LockInfoData rd_lockInfo;   /* lock mgr's info for locking relation */     RuleLock   *rd_rules;       /* rewrite rules */     MemoryContext rd_rulescxt;  /* private memory cxt for rd_rules, if any */     TriggerDesc *trigdesc;      /* Trigger info, or NULL if rel has none */     /* use "struct" here to avoid needing to include rowsecurity.h: */     struct RowSecurityDesc *rd_rsdesc;  /* row security policies, or NULL */      /* data managed by RelationGetFKeyList: */     List       *rd_fkeylist;    /* list of ForeignKeyCacheInfo (see below) */     bool        rd_fkeyvalid;   /* true if list has been computed */      MemoryContext rd_partkeycxt;    /* private memory cxt for the below */     struct PartitionKeyData *rd_partkey;    /* partition key, or NULL */     MemoryContext rd_pdcxt;     /* private context for partdesc */     struct PartitionDescData *rd_partdesc;  /* partitions, or NULL */     List       *rd_partcheck;   /* partition CHECK quals */      /* data managed by RelationGetIndexList: */     List       *rd_indexlist;   /* list of OIDs of indexes on relation */     Oid         rd_oidindex;    /* OID of unique index on OID, if any */     Oid         rd_pkindex;     /* OID of primary key, if any */     Oid         rd_replidindex; /* OID of replica identity index, if any */      /* data managed by RelationGetStatExtList: */     List       *rd_statlist;    /* list of OIDs of extended stats */      /* data managed by RelationGetIndexAttrBitmap: */     
Bitmapset  *rd_indexattr;   /* columns used in non-projection indexes */     Bitmapset  *rd_projindexattr;   /* columns used in projection indexes */     Bitmapset  *rd_keyattr;     /* cols that can be ref'd by foreign keys */     Bitmapset  *rd_pkattr;      /* cols included in primary key */     Bitmapset  *rd_idattr;      /* included in replica identity index */     Bitmapset  *rd_projidx;     /* Oids of projection indexes */      PublicationActions *rd_pubactions;  /* publication actions */      /*      * rd_options is set whenever rd_rel is loaded into the relcache entry.      * Note that you can NOT look into rd_rel for this data.  NULL means "use      * defaults".      */     bytea      *rd_options;     /* parsed pg_class.reloptions */      /* These are non-NULL only for an index relation: */     Form_pg_index rd_index;     /* pg_index tuple describing this index */     /* use "struct" here to avoid needing to include htup.h: */     struct HeapTupleData *rd_indextuple;    /* all of pg_index tuple */      /*      * index access support info (used only for an index relation)      *      * Note: only default support procs for each opclass are cached, namely      * those with lefttype and righttype equal to the opclass's opcintype. The      * arrays are indexed by support function number, which is a sufficient      * identifier given that restriction.      *      * Note: rd_amcache is available for index AMs to cache private data about      * an index.  This must be just a cache since it may get reset at any time      * (in particular, it will get reset by a relcache inval message for the      * index).  If used, it must point to a single memory chunk palloc'd in      * rd_indexcxt.  A relcache reset will include freeing that chunk and      * setting rd_amcache = NULL.      
*/     Oid         rd_amhandler;   /* OID of index AM's handler function */     MemoryContext rd_indexcxt;  /* private memory cxt for this stuff */     /* use "struct" here to avoid needing to include amapi.h: */     struct IndexAmRoutine *rd_amroutine;    /* index AM's API struct */     Oid        *rd_opfamily;    /* OIDs of op families for each index col */     Oid        *rd_opcintype;   /* OIDs of opclass declared input data types */     RegProcedure *rd_support;   /* OIDs of support procedures */     FmgrInfo   *rd_supportinfo; /* lookup info for support procedures */     int16      *rd_indoption;   /* per-column AM-specific flags */     List       *rd_indexprs;    /* index expression trees, if any */     List       *rd_indpred;     /* index predicate tree, if any */     Oid        *rd_exclops;     /* OIDs of exclusion operators, if any */     Oid        *rd_exclprocs;   /* OIDs of exclusion ops' procs, if any */     uint16     *rd_exclstrats;  /* exclusion ops' strategy numbers, if any */     void       *rd_amcache;     /* available for use by index AM */     Oid        *rd_indcollation;    /* OIDs of index collations */      /*      * foreign-table support      *      * rd_fdwroutine must point to a single memory chunk palloc'd in      * CacheMemoryContext.  It will be freed and reset to NULL on a relcache      * reset.      */      /* use "struct" here to avoid needing to include fdwapi.h: */     struct FdwRoutine *rd_fdwroutine;   /* cached function pointers, or NULL */      /*      * Hack for CLUSTER, rewriting ALTER TABLE, etc: when writing a new      * version of a table, we need to make any toast pointers inserted into it      * have the existing toast table's OID, not the OID of the transient toast      * table.  If rd_toastoid isn't InvalidOid, it is the OID to place in      * toast pointers inserted into this rel.  (Note it's set on the new      * version of the main heap, not the toast table itself.)  
This also      * causes toast_save_datum() to try to preserve toast value OIDs.      */     Oid         rd_toastoid;    /* Real TOAST table's OID, or InvalidOid */      /* use "struct" here to avoid needing to include pgstat.h: */     struct PgStat_TableStatus *pgstat_info; /* statistics collection area */ } RelationData; typedef struct RelationData *Relation;
2、Buffer：实际类型为整型，共享缓冲区的index，0为非法Buffer。
/*  * Buffer identifiers.  *  * Zero is invalid, positive is the index of a shared buffer (1..NBuffers),  * negative is the index of a local buffer (-1 .. -NLocBuffer).  */ typedef int Buffer;  #define InvalidBuffer   0
3、HeapTupleHeader：Heap（还有一种是Index）类型Tuple的头部数据，在Page结构中已作详细分析。
struct HeapTupleHeaderData {     union     {         HeapTupleFields t_heap;         DatumTupleFields t_datum;     }           t_choice;      ItemPointerData t_ctid;     /* current TID of this or newer tuple (or a                                  * speculative insertion token) */     /* Fields below here must match MinimalTupleData! */  #define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2     uint16      t_infomask2;    /* number of attributes + various flags */  #define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3     uint16      t_infomask;     /* various flag bits, see below */  #define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4     uint8       t_hoff;         /* sizeof header incl. 
bitmap, padding */      /* ^ - 23 bytes - ^ */  #define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5     bits8       t_bits[FLEXIBLE_ARRAY_MEMBER];  /* bitmap of NULLs */      /* MORE DATA FOLLOWS AT END OF STRUCT */ };
4、ItemPointerData：数据行指针数据结构，ip_blkid是数据块ID，ip_posid是Tuple在数据块中的偏移（其实是类似数组中的序号）。
typedef struct ItemPointerData {     BlockIdData ip_blkid;     OffsetNumber ip_posid; }  ItemPointerData;  typedef ItemPointerData *ItemPointer; typedef struct BlockIdData {     uint16      bi_hi;     uint16      bi_lo; } BlockIdData;  typedef BlockIdData *BlockId; /* block identifier */
5、HeapTuple：存储在Heap中的Tuple（Row）数据结构：
typedef struct HeapTupleData {     uint32      t_len;          /* length of *t_data */     ItemPointerData t_self;     /* SelfItemPointer */     Oid         t_tableOid;     /* table the tuple came from */ #define FIELDNO_HEAPTUPLEDATA_DATA 3     HeapTupleHeader t_data;     /* -> tuple header and data */ } HeapTupleData;  typedef HeapTupleData *HeapTuple;  #define HEAPTUPLESIZE   MAXALIGN(sizeof(HeapTupleData))
6、HeapTupleHeaderIsSpeculative
#define HeapTupleHeaderIsSpeculative(tup) \ ( \  (ItemPointerGetOffsetNumberNoCheck(&(tup)->t_ctid) == SpecTokenOffsetNumber) \ ) #define ItemPointerGetOffsetNumberNoCheck(pointer) \ ( \  (pointer)->ip_posid \ )
7、BufferGetPage
//获取与该buffer（有符号整型）对应的page
#define BufferGetPage(buffer) ((Page)BufferGetBlock(buffer)) #define BufferGetBlock(buffer) \ ( \  AssertMacro(BufferIsValid(buffer)), \  BufferIsLocal(buffer) ? \  LocalBufferBlockPointers[-(buffer) - 1] \  : \  (Block) (BufferBlocks + ((Size) ((buffer) - 1)) * BLCKSZ) \ ) #define BufferIsLocal(buffer) ((buffer) < 0)
typedef void *Block;//指向任意类型的指针
Block *LocalBufferBlockPointers = NULL;//指针的指针
8、BufferGetBlockNumber
/*  * BufferGetBlockNumber  *      Returns the block number associated with a buffer.  *  * Note:  *      Assumes that the buffer is valid and pinned, else the  *      value may be obsolete immediately...  
*/ BlockNumber BufferGetBlockNumber(Buffer buffer) {     BufferDesc *bufHdr;      Assert(BufferIsPinned(buffer));      if (BufferIsLocal(buffer))         bufHdr = GetLocalBufferDescriptor(-buffer - 1);     else         bufHdr = GetBufferDescriptor(buffer - 1);      /* pinned, so OK to read tag without spinlock */     return bufHdr->tag.blockNum; }
9、BlockIdSet
/*  * BlockIdSet  *      Sets a block identifier to the specified value.  */ #define BlockIdSet(blockId, blockNumber) \ ( \     AssertMacro(PointerIsValid(blockId)), \     (blockId)->bi_hi = (blockNumber) >> 16, \//右移16位，得到高位     (blockId)->bi_lo = (blockNumber) & 0xffff \//高16位全部置0，得到低位 )
10、ItemPointerSet
/*  * ItemPointerSet  * Sets a disk item pointer to the specified block and offset.  */ #define ItemPointerSet(pointer, blockNumber, offNum) \ ( \  AssertMacro(PointerIsValid(pointer)), \  BlockIdSet(&((pointer)->ip_blkid), blockNumber), \  (pointer)->ip_posid = offNum \ )
11、PageGetItemId：获取行指针（ItemIdData指针）
/*  * PageGetItemId  * Returns an item identifier of a page.  */ #define PageGetItemId(page, offsetNumber) \  ((ItemId) (&((PageHeader) (page))->pd_linp[(offsetNumber) - 1]))
12、PageGetItem：根据ItemId获取相应的Item（Tuple）
/*  * PageGetItem  *      Retrieves an item on the given page.  *  * Note:  *      This does not change the status of any of the resources passed.  *      The semantics may change in the future.  */ #define PageGetItem(page, itemId) \ ( \     AssertMacro(PageIsValid(page)), \     AssertMacro(ItemIdHasStorage(itemId)), \     (Item)(((char *)(page)) + ItemIdGetOffset(itemId)) \ ) #define ItemIdGetOffset(itemId) \  ((itemId)->lp_off)

二、源码解读

/* * RelationPutHeapTuple - place tuple at specified page * * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!! * * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer. */voidRelationPutHeapTuple(Relation relation,                     Buffer buffer,                     HeapTuple tuple,                     bool token){    Page        pageHeader;//页头    OffsetNumber offnum;//行偏移    /*     * A tuple that's being inserted speculatively should already have its     * token set.     */    //TODO token & speculatively有待考究    Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));    /* Add the tuple to the page */    //根据buffer获取相应的page（页头）    pageHeader = BufferGetPage(buffer);    //插入数据,PageAddItem函数上一节已介绍，函数成功返回行偏移   /*   输入：      page-指向Page的指针      item-指向数据的指针      size-数据大小      offsetNumber-数据存储的偏移量，InvalidOffsetNumber表示不指定      flags-不"覆盖"原数据      is_heap-Heap数据    输出：      OffsetNumber-数据存储实际的偏移量    */    offnum = PageAddItem(pageHeader, (Item) tuple->t_data,                         tuple->t_len, InvalidOffsetNumber, false, true);    //如果不成功，记录日志    if (offnum == InvalidOffsetNumber)        elog(PANIC, "failed to add tuple to page");        /* Update tuple->t_self to the actual position where it was stored */    //&(tuple->t_self)类型为ItemPointer，亦即行指针（ItemPointerData结构体指针）    //根据buffer获取块号，把块号和行偏移写入行指针中    ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);    /*     * Insert the correct position into CTID of the stored tuple, too (unless     * this is a speculative insertion, in which case the token is held in     * CTID field instead)     */    if (!token)    {        //获取行指针，ItemId即ItemIdData指针        ItemId      itemId = PageGetItemId(pageHeader, offnum);        //获取TupleHeader        HeapTupleHeader item = (HeapTupleHeader) PageGetItem(pageHeader, itemId);        //更新TupleHeader中的行指针        item->t_ctid = tuple->t_self;    }}

三、跟踪分析

使用上一节的数据表，回收垃圾后，插入一条记录。

testdb=# vacuum t_insert;
VACUUM
testdb=#
testdb=# checkpoint;
CHECKPOINT
testdb=#  select pg_backend_pid();
 pg_backend_pid
----------------
           1582
(1 row)

使用gdb进行跟踪分析：

[root@localhost ~]# gdb -p 1582
GNU gdb (GDB) Red Hat Enterprise Linux 7.6.1-100.el7
...
(gdb)

插入一条记录：

testdb=# -- 插入1行
testdb=# insert into t_insert values(10,'10','10','10');
（挂起）

回到gdb：

(gdb) b RelationPutHeapTuple
Breakpoint 1 at 0x4cf492: file hio.c, line 51.
#查看输入参数
(gdb) p *relation
$5 = {rd_node = {spcNode = 1663, dbNode = 16477, relNode = 26731}, rd_smgr = 0x259db68, rd_refcnt = 1, rd_backend = -1, rd_islocaltemp = false, rd_isnailed = false, rd_isvalid = true,
  rd_indexvalid = 0 '\000', rd_statvalid = false, rd_createSubid = 0, rd_newRelfilenodeSubid = 0, rd_rel = 0x7fa9814589e8, rd_att = 0x7fa981458af8, rd_id = 26731, rd_lockInfo = {lockRelId = {
      relId = 26731, dbId = 16477}}, rd_rules = 0x0, rd_rulescxt = 0x0, trigdesc = 0x0, rd_rsdesc = 0x0, rd_fkeylist = 0x0, rd_fkeyvalid = false, rd_partkeycxt = 0x0, rd_partkey = 0x0, rd_pdcxt = 0x0,
  rd_partdesc = 0x0, rd_partcheck = 0x0, rd_indexlist = 0x0, rd_oidindex = 0, rd_pkindex = 0, rd_replidindex = 0, rd_statlist = 0x0, rd_indexattr = 0x0, rd_projindexattr = 0x0, rd_keyattr = 0x0,
  rd_pkattr = 0x0, rd_idattr = 0x0, rd_projidx = 0x0, rd_pubactions = 0x0, rd_options = 0x0, rd_index = 0x0, rd_indextuple = 0x0, rd_amhandler = 0, rd_indexcxt = 0x0, rd_amroutine = 0x0,
  rd_opfamily = 0x0, rd_opcintype = 0x0, rd_support = 0x0, rd_supportinfo = 0x0, rd_indoption = 0x0, rd_indexprs = 0x0, rd_indpred = 0x0, rd_exclops = 0x0, rd_exclprocs = 0x0, rd_exclstrats = 0x0,
  rd_amcache = 0x0, rd_indcollation = 0x0, rd_fdwroutine = 0x0, rd_toastoid = 0, pgstat_info = 0x2591850}
(gdb) p buffer
$6 = 95
(gdb) p tuple
$7 = (HeapTuple) 0x2539a20
(gdb) p *tuple  #注：HeapTuple
$8 = {t_len = 61, t_self = {ip_blkid = {bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, t_tableOid = 26731, t_data = 0x2539a38}
(gdb) p *tuple->t_data #注：HeapTupleHeader
$9 = {t_choice = {t_heap = {t_xmin = 1612851, t_xmax = 0, t_field3 = {t_cid = 0, t_xvac = 0}}, t_datum = {datum_len_ = 1612851, datum_typmod = 0, datum_typeid = 0}}, t_ctid = {ip_blkid = {
      bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, t_infomask2 = 4, t_infomask = 2050, t_hoff = 24 '\030', t_bits = 0x2539a4f ""}
(gdb) p token
$10 = false
#查看PageHeader信息
(gdb) p *(PageHeader)pageHeader
$11 = {pd_lsn = {xlogid = 1, xrecoff = 3677464616}, pd_checksum = 0, pd_flags = 5, pd_lower = 60, pd_upper = 7680, pd_special = 8192, pd_pagesize_version = 8196, pd_prune_xid = 0,
  pd_linp = 0x7fa96957d318}
#调用PageAddItem函数后
(gdb) next
56      if (offnum == InvalidOffsetNumber)
(gdb) p offnum #2号Item被删除，在执行vacuum回收后，已可用
$12 = 2
(gdb) p *itemId
$13 = {lp_off = 7616, lp_flags = 1, lp_len = 61}
(gdb) p *item
$14 = {t_choice = {t_heap = {t_xmin = 1612851, t_xmax = 0, t_field3 = {t_cid = 0, t_xvac = 0}}, t_datum = {datum_len_ = 1612851, datum_typmod = 0, datum_typeid = 0}}, t_ctid = {ip_blkid = {
      bi_hi = 65535, bi_lo = 65535}, ip_posid = 0}, t_infomask2 = 4, t_infomask = 2050, t_hoff = 24 '\030', t_bits = 0x7fa96957f0d7 ""}
(gdb) next
74  }
(gdb) p *item
No symbol "item" in current context.
(gdb) p tuple->t_self
$15 = {ip_blkid = {bi_hi = 0, bi_lo = 0}, ip_posid = 2} #0号Block，2号偏移
(gdb) c
Continuing.

可以看到，这行数据"正确"地插入在0号Block、2号偏移的位置上。