您好,登錄后才能下訂單哦!
這篇文章主要介紹“PostgreSQL中ExecHashJoin依賴其他函數的實現邏輯分析”,在日常操作中,相信很多人在PostgreSQL中ExecHashJoin依賴其他函數的實現邏輯分析問題上存在疑惑,小編查閱了各式資料,整理出簡單好用的操作方法,希望對大家解答”PostgreSQL中ExecHashJoin依賴其他函數的實現邏輯分析”的疑惑有所幫助!接下來,請跟著小編一起來學習吧!
這些函數在HJ_NEED_NEW_OUTER階段中使用,包括ExecHashJoinOuterGetTuple、ExecPrepHashTableForUnmatched、ExecHashGetBucketAndBatch、ExecHashGetSkewBucket、ExecHashJoinSaveTuple和ExecFetchSlotMinimalTuple等。
Plan
所有計劃節點通過將Plan結構作為第一個字段從Plan結構“派生”。這確保了在將節點轉換為計劃節點時,一切都能正常工作。(在執行器中以通用方式傳遞時,節點指針經常被轉換為Plan *)
/* ---------------- * Plan node * * All plan nodes "derive" from the Plan structure by having the * Plan structure as the first field. This ensures that everything works * when nodes are cast to Plan's. (node pointers are frequently cast to Plan* * when passed around generically in the executor) * 所有計劃節點通過將Plan結構作為第一個字段從Plan結構“派生”。 * 這確保了在將節點轉換為計劃節點時,一切都能正常工作。 * (在執行器中以通用方式傳遞時,節點指針經常被轉換為Plan *) * * We never actually instantiate any Plan nodes; this is just the common * abstract superclass for all Plan-type nodes. * 從未實例化任何Plan節點;這只是所有Plan-type節點的通用抽象超類。 * ---------------- */ typedef struct Plan { NodeTag type;//節點類型 /* * 成本估算信息;estimated execution costs for plan (see costsize.c for more info) */ Cost startup_cost; /* 啟動成本;cost expended before fetching any tuples */ Cost total_cost; /* 總成本;total cost (assuming all tuples fetched) */ /* * 優化器估算信息;planner's estimate of result size of this plan step */ double plan_rows; /* 行數;number of rows plan is expected to emit */ int plan_width; /* 平均行大小(Byte為單位);average row width in bytes */ /* * 并行執行相關的信息;information needed for parallel query */ bool parallel_aware; /* 是否參與并行執行邏輯?engage parallel-aware logic? */ bool parallel_safe; /* 是否并行安全;OK to use as part of parallel plan? */ /* * Plan類型節點通用的信息.Common structural data for all Plan types. */ int plan_node_id; /* unique across entire final plan tree */ List *targetlist; /* target list to be computed at this node */ List *qual; /* implicitly-ANDed qual conditions */ struct Plan *lefttree; /* input plan tree(s) */ struct Plan *righttree; List *initPlan; /* Init Plan nodes (un-correlated expr * subselects) */ /* * Information for management of parameter-change-driven rescanning * parameter-change-driven重掃描的管理信息. * * extParam includes the paramIDs of all external PARAM_EXEC params * affecting this plan node or its children. setParam params from the * node's initPlans are not included, but their extParams are. * * allParam includes all the extParam paramIDs, plus the IDs of local * params that affect the node (i.e., the setParams of its initplans). * These are _all_ the PARAM_EXEC params that affect this node. */ Bitmapset *extParam; Bitmapset *allParam; } Plan;
JoinState
Hash/NestLoop/Merge Join的基類
/* ---------------- * JoinState information * * Superclass for state nodes of join plans. * Hash/NestLoop/Merge Join的基類 * ---------------- */ typedef struct JoinState { PlanState ps;//基類PlanState JoinType jointype;//連接類型 //在找到一個匹配inner tuple的時候,如需要跳轉到下一個outer tuple,則該值為T bool single_match; /* True if we should skip to next outer tuple * after finding one inner match */ //連接條件表達式(除了ps.qual) ExprState *joinqual; /* JOIN quals (in addition to ps.qual) */ } JoinState;
HashJoinState
Hash Join運行期狀態結構體
/* these structs are defined in executor/hashjoin.h: */ typedef struct HashJoinTupleData *HashJoinTuple; typedef struct HashJoinTableData *HashJoinTable; typedef struct HashJoinState { JoinState js; /* 基類;its first field is NodeTag */ ExprState *hashclauses;//hash連接條件 List *hj_OuterHashKeys; /* 外表條件鏈表;list of ExprState nodes */ List *hj_InnerHashKeys; /* 內表連接條件;list of ExprState nodes */ List *hj_HashOperators; /* 操作符OIDs鏈表;list of operator OIDs */ HashJoinTable hj_HashTable;//Hash表 uint32 hj_CurHashValue;//當前的Hash值 int hj_CurBucketNo;//當前的bucket編號 int hj_CurSkewBucketNo;//行傾斜bucket編號 HashJoinTuple hj_CurTuple;//當前元組 TupleTableSlot *hj_OuterTupleSlot;//outer relation slot TupleTableSlot *hj_HashTupleSlot;//Hash tuple slot TupleTableSlot *hj_NullOuterTupleSlot;//用于外連接的outer虛擬slot TupleTableSlot *hj_NullInnerTupleSlot;//用于外連接的inner虛擬slot TupleTableSlot *hj_FirstOuterTupleSlot;// int hj_JoinState;//JoinState狀態 bool hj_MatchedOuter;//是否匹配 bool hj_OuterNotEmpty;//outer relation是否為空 } HashJoinState;
HashJoinTable
Hash表數據結構
typedef struct HashJoinTableData { int nbuckets; /* 內存中的hash桶數;# buckets in the in-memory hash table */ int log2_nbuckets; /* 2的對數(nbuckets必須是2的冪);its log2 (nbuckets must be a power of 2) */ int nbuckets_original; /* 首次hash時的桶數;# buckets when starting the first hash */ int nbuckets_optimal; /* 優化后的桶數(每個批次);optimal # buckets (per batch) */ int log2_nbuckets_optimal; /* 2的對數;log2(nbuckets_optimal) */ /* buckets[i] is head of list of tuples in i'th in-memory bucket */ //bucket [i]是內存中第i個桶中的元組鏈表的head item union { /* unshared array is per-batch storage, as are all the tuples */ //未共享數組是按批處理存儲的,所有元組均如此 struct HashJoinTupleData **unshared; /* shared array is per-query DSA area, as are all the tuples */ //共享數組是每個查詢的DSA區域,所有元組均如此 dsa_pointer_atomic *shared; } buckets; bool keepNulls; /*如不匹配則存儲NULL元組,該值為T;true to store unmatchable NULL tuples */ bool skewEnabled; /*是否使用傾斜優化?;are we using skew optimization? */ HashSkewBucket **skewBucket; /* 傾斜的hash表桶數;hashtable of skew buckets */ int skewBucketLen; /* skewBucket數組大小;size of skewBucket array (a power of 2!) */ int nSkewBuckets; /* 活動的傾斜桶數;number of active skew buckets */ int *skewBucketNums; /* 活動傾斜桶數組索引;array indexes of active skew buckets */ int nbatch; /* 批次數;number of batches */ int curbatch; /* 當前批次,第一輪為0;current batch #; 0 during 1st pass */ int nbatch_original; /* 在開始inner掃描時的批次;nbatch when we started inner scan */ int nbatch_outstart; /* 在開始outer掃描時的批次;nbatch when we started outer scan */ bool growEnabled; /* 關閉nbatch增加的標記;flag to shut off nbatch increases */ double totalTuples; /* 從inner plan獲得的元組數;# tuples obtained from inner plan */ double partialTuples; /* 通過hashjoin獲得的inner元組數;# tuples obtained from inner plan by me */ double skewTuples; /* 傾斜元組數;# tuples inserted into skew tuples */ /* * These arrays are allocated for the life of the hash join, but only if * nbatch > 1. A file is opened only when we first write a tuple into it * (otherwise its pointer remains NULL). Note that the zero'th array * elements never get used, since we will process rather than dump out any * tuples of batch zero. * 這些數組在散列連接的生命周期內分配,但僅當nbatch > 1時分配。 * 只有當第一次將元組寫入文件時,文件才會打開(否則它的指針將保持NULL)。 * 注意,第0個數組元素永遠不會被使用,因為批次0的元組永遠不會轉儲. */ BufFile **innerBatchFile; /* 每個批次的inner虛擬臨時文件緩存;buffered virtual temp file per batch */ BufFile **outerBatchFile; /* 每個批次的outer虛擬臨時文件緩存;buffered virtual temp file per batch */ /* * Info about the datatype-specific hash functions for the datatypes being * hashed. These are arrays of the same length as the number of hash join * clauses (hash keys). * 有關正在散列的數據類型的特定于數據類型的散列函數的信息。 * 這些數組的長度與散列連接子句(散列鍵)的數量相同。 */ FmgrInfo *outer_hashfunctions; /* outer hash函數FmgrInfo結構體;lookup data for hash functions */ FmgrInfo *inner_hashfunctions; /* inner hash函數FmgrInfo結構體;lookup data for hash functions */ bool *hashStrict; /* 每個hash操作符是嚴格?is each hash join operator strict? */ Size spaceUsed; /* 元組使用的當前內存空間大小;memory space currently used by tuples */ Size spaceAllowed; /* 空間使用上限;upper limit for space used */ Size spacePeak; /* 峰值的空間使用;peak space used */ Size spaceUsedSkew; /* 傾斜哈希表的當前空間使用情況;skew hash table's current space usage */ Size spaceAllowedSkew; /* 傾斜哈希表的使用上限;upper limit for skew hashtable */ MemoryContext hashCxt; /* 整個散列連接存儲的上下文;context for whole-hash-join storage */ MemoryContext batchCxt; /* 該批次存儲的上下文;context for this-batch-only storage */ /* used for dense allocation of tuples (into linked chunks) */ //用于密集分配元組(到鏈接塊中) HashMemoryChunk chunks; /* 整個批次使用一個鏈表;one list for the whole batch */ /* Shared and private state for Parallel Hash. */ //并行hash使用的共享和私有狀態 HashMemoryChunk current_chunk; /* 后臺進程的當前chunk;this backend's current chunk */ dsa_area *area; /* 用于分配內存的DSA區域;DSA area to allocate memory from */ ParallelHashJoinState *parallel_state;//并行執行狀態 ParallelHashJoinBatchAccessor *batches;//并行訪問器 dsa_pointer current_chunk_shared;//當前chunk的開始指針 } HashJoinTableData; typedef struct HashJoinTableData *HashJoinTable;
HashJoinTupleData
Hash連接元組數據
/* ---------------------------------------------------------------- * hash-join hash table structures * * Each active hashjoin has a HashJoinTable control block, which is * palloc'd in the executor's per-query context. All other storage needed * for the hashjoin is kept in private memory contexts, two for each hashjoin. * This makes it easy and fast to release the storage when we don't need it * anymore. (Exception: data associated with the temp files lives in the * per-query context too, since we always call buffile.c in that context.) * 每個活動的hashjoin都有一個可散列的控制塊,它在執行程序的每個查詢上下文中都是通過palloc分配的。 * hashjoin所需的所有其他存儲都保存在私有內存上下文中,每個hashjoin有兩個。 * 當不再需要它的時候,這使得釋放它變得簡單和快速。 * (例外:與臨時文件相關的數據也存在于每個查詢上下文中,因為在這種情況下總是調用buffile.c。) * * The hashtable contexts are made children of the per-query context, ensuring * that they will be discarded at end of statement even if the join is * aborted early by an error. (Likewise, any temporary files we make will * be cleaned up by the virtual file manager in event of an error.) * hashtable上下文是每個查詢上下文的子上下文,確保在語句結束時丟棄它們,即使連接因錯誤而提前中止。 * (同樣,如果出現錯誤,虛擬文件管理器將清理創建的任何臨時文件。) * * Storage that should live through the entire join is allocated from the * "hashCxt", while storage that is only wanted for the current batch is * allocated in the "batchCxt". By resetting the batchCxt at the end of * each batch, we free all the per-batch storage reliably and without tedium. * 通過整個連接的存儲空間應從“hashCxt”分配,而只需要當前批處理的存儲空間在“batchCxt”中分配。 * 通過在每個批處理結束時重置batchCxt,可以可靠地釋放每個批處理的所有存儲,而不會感到單調乏味。 * * During first scan of inner relation, we get its tuples from executor. * If nbatch > 1 then tuples that don't belong in first batch get saved * into inner-batch temp files. The same statements apply for the * first scan of the outer relation, except we write tuples to outer-batch * temp files. After finishing the first scan, we do the following for * each remaining batch: * 1. Read tuples from inner batch file, load into hash buckets. * 2. Read tuples from outer batch file, match to hash buckets and output. * 在內部關系的第一次掃描中,從執行者那里得到了它的元組。 * 如果nbatch > 1,那么不屬于第一批的元組將保存到批內臨時文件中。 * 相同的語句適用于外關系的第一次掃描,但是我們將元組寫入外部批處理臨時文件。 * 完成第一次掃描后,我們對每批剩余的元組做如下處理: * 1.從內部批處理文件讀取元組,加載到散列桶中。 * 2.從外部批處理文件讀取元組,匹配哈希桶和輸出。 * * It is possible to increase nbatch on the fly if the in-memory hash table * gets too big. The hash-value-to-batch computation is arranged so that this * can only cause a tuple to go into a later batch than previously thought, * never into an earlier batch. When we increase nbatch, we rescan the hash * table and dump out any tuples that are now of a later batch to the correct * inner batch file. Subsequently, while reading either inner or outer batch * files, we might find tuples that no longer belong to the current batch; * if so, we just dump them out to the correct batch file. * 如果內存中的哈希表太大,可以動態增加nbatch。 * 散列值到批處理的計算是這樣安排的: * 這只會導致元組進入比以前認為的更晚的批處理,而不會進入更早的批處理。 * 當增加nbatch時,重新掃描哈希表,并將現在屬于后面批處理的任何元組轉儲到正確的內部批處理文件。 * 隨后,在讀取內部或外部批處理文件時,可能會發現不再屬于當前批處理的元組; * 如果是這樣,只需將它們轉儲到正確的批處理文件即可。 * ---------------------------------------------------------------- */ /* these are in nodes/execnodes.h: */ /* typedef struct HashJoinTupleData *HashJoinTuple; */ /* typedef struct HashJoinTableData *HashJoinTable; */ typedef struct HashJoinTupleData { /* link to next tuple in same bucket */ //link同一個桶中的下一個元組 union { struct HashJoinTupleData *unshared; dsa_pointer shared; } next; uint32 hashvalue; /* 元組的hash值;tuple's hash code */ /* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */ } HashJoinTupleData; #define HJTUPLE_OVERHEAD MAXALIGN(sizeof(HashJoinTupleData)) #define HJTUPLE_MINTUPLE(hjtup) \ ((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))
ExecHashJoinOuterGetTuple
獲取非并行模式下hashjoin的下一個外部元組:要么在第一次執行外部plan節點,要么從hashjoin批處理的臨時文件中獲取。
/*---------------------------------------------------------------------------------------------------- HJ_NEED_NEW_OUTER 階段 ----------------------------------------------------------------------------------------------------*/ /* * ExecHashJoinOuterGetTuple * * get the next outer tuple for a parallel oblivious hashjoin: either by * executing the outer plan node in the first pass, or from the temp * files for the hashjoin batches. * 獲取非并行模式下hashjoin的下一個外部元組:要么在第一次執行外部plan節點,要么從hashjoin批處理的臨時文件中獲取。 * * Returns a null slot if no more outer tuples (within the current batch). * 如果沒有更多外部元組(在當前批處理中),則返回空slot。 * * On success, the tuple's hash value is stored at *hashvalue --- this is * either originally computed, or re-read from the temp file. * 如果成功,tuple的散列值存儲在輸入參數*hashvalue中——這是最初計算的,或者是從臨時文件中重新讀取的。 */ static TupleTableSlot * ExecHashJoinOuterGetTuple(PlanState *outerNode,//outer 節點 HashJoinState *hjstate,//Hash Join執行狀態 uint32 *hashvalue)//Hash值 { HashJoinTable hashtable = hjstate->hj_HashTable;//hash表 int curbatch = hashtable->curbatch;//當前批次 TupleTableSlot *slot;//返回的slot if (curbatch == 0) /* 第一個批次;if it is the first pass */ { /* * Check to see if first outer tuple was already fetched by * ExecHashJoin() and not used yet. * 檢查第一個外部元組是否已經由ExecHashJoin()函數獲取且尚未使用。 */ slot = hjstate->hj_FirstOuterTupleSlot; if (!TupIsNull(slot)) hjstate->hj_FirstOuterTupleSlot = NULL;//重置slot else slot = ExecProcNode(outerNode);//如為NULL,則獲取slot while (!TupIsNull(slot))//slot不為NULL { /* * We have to compute the tuple's hash value. * 計算hash值 */ ExprContext *econtext = hjstate->js.ps.ps_ExprContext;//表達式計算上下文 econtext->ecxt_outertuple = slot;//存儲獲取的slot if (ExecHashGetHashValue(hashtable, econtext, hjstate->hj_OuterHashKeys, true, /* outer tuple */ HJ_FILL_OUTER(hjstate), hashvalue))//計算Hash值 { /* remember outer relation is not empty for possible rescan */ hjstate->hj_OuterNotEmpty = true;//設置標記(outer不為空) return slot;//返回匹配的slot } /* * That tuple couldn't match because of a NULL, so discard it and * continue with the next one. * 該元組無法匹配,丟棄它,繼續下一個元組。 */ slot = ExecProcNode(outerNode);//繼續獲取下一個 } } else if (curbatch < hashtable->nbatch)//不是第一個批次 { BufFile *file = hashtable->outerBatchFile[curbatch];//獲取緩沖的文件 /* * In outer-join cases, we could get here even though the batch file * is empty. * 在外連接的情況下,即使批處理文件是空的,也可以在這里進行處理。 */ if (file == NULL) return NULL;//如文件為NULL,則返回 slot = ExecHashJoinGetSavedTuple(hjstate, file, hashvalue, hjstate->hj_OuterTupleSlot);//從文件中獲取slot if (!TupIsNull(slot)) return slot;//非NULL,則返回 } /* End of this batch */ //已完成,則返回NULL return NULL; } /* * ExecHashGetHashValue * Compute the hash value for a tuple * ExecHashGetHashValue - 計算元組的Hash值 * * The tuple to be tested must be in either econtext->ecxt_outertuple or * econtext->ecxt_innertuple. Vars in the hashkeys expressions should have * varno either OUTER_VAR or INNER_VAR. * 要測試的元組必須位于econtext->ecxt_outertuple或econtext->ecxt_innertuple中。 * hashkeys表達式中的Vars應該具有varno,即OUTER_VAR或INNER_VAR。 * * A true result means the tuple's hash value has been successfully computed * and stored at *hashvalue. A false result means the tuple cannot match * because it contains a null attribute, and hence it should be discarded * immediately. (If keep_nulls is true then false is never returned.) * T意味著tuple的散列值已經成功計算并存儲在*hashvalue參數中。 * F意味著元組不能匹配,因為它包含null屬性,因此應該立即丟棄它。 * (如果keep_nulls為真,則永遠不會返回F。) */ bool ExecHashGetHashValue(HashJoinTable hashtable,//Hash表 ExprContext *econtext,//上下文 List *hashkeys,//Hash鍵值鏈表 bool outer_tuple,//是否外表元組 bool keep_nulls,//是否保存NULL uint32 *hashvalue)//返回的Hash值 { uint32 hashkey = 0;//hash鍵 FmgrInfo *hashfunctions;//hash函數 ListCell *hk;//臨時變量 int i = 0; MemoryContext oldContext; /* * We reset the eval context each time to reclaim any memory leaked in the * hashkey expressions. * 我們每次重置eval上下文來回收hashkey表達式中分配的內存。 */ ResetExprContext(econtext); //切換上下文 oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); if (outer_tuple) hashfunctions = hashtable->outer_hashfunctions;//外表元組 else hashfunctions = hashtable->inner_hashfunctions;//內表元組 foreach(hk, hashkeys)//遍歷Hash鍵值 { ExprState *keyexpr = (ExprState *) lfirst(hk);//鍵值表達式 Datum keyval; bool isNull; /* rotate hashkey left 1 bit at each step */ //哈希鍵左移1位 hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); /* * Get the join attribute value of the tuple * 獲取元組的連接屬性值 */ keyval = ExecEvalExpr(keyexpr, econtext, &isNull); /* * If the attribute is NULL, and the join operator is strict, then * this tuple cannot pass the join qual so we can reject it * immediately (unless we're scanning the outside of an outer join, in * which case we must not reject it). Otherwise we act like the * hashcode of NULL is zero (this will support operators that act like * IS NOT DISTINCT, though not any more-random behavior). We treat * the hash support function as strict even if the operator is not. * 如果屬性為NULL,并且join操作符是嚴格的,那么這個元組不能傳遞連接條件join qual, * 因此可以立即拒絕它(除非正在掃描外連接的外表,在這種情況下不能拒絕它)。 * 否則,我們的行為就好像NULL的哈希碼是零一樣(這將支持IS NOT DISTINCT操作符,但不會有任何隨機的情況出現)。 * 即使操作符不是嚴格的,也將哈希函數視為嚴格的。 * * Note: currently, all hashjoinable operators must be strict since * the hash index AM assumes that. However, it takes so little extra * code here to allow non-strict that we may as well do it. * 注意:目前,所有哈希可連接操作符都必須嚴格,因為哈希索引AM假定如此。 * 但是,這里只需要很少的額外代碼就可以實現非嚴格性,我們也可以這樣做。 */ if (isNull) { //NULL值 if (hashtable->hashStrict[i] && !keep_nulls) { MemoryContextSwitchTo(oldContext); //不保持NULL值,不匹配 return false; /* cannot match */ } /* else, leave hashkey unmodified, equivalent to hashcode 0 */ //否則的話,不修改hashkey,仍為0 } else { //不為NULL /* Compute the hash function */ //計算hash值 uint32 hkey; hkey = DatumGetUInt32(FunctionCall1(&hashfunctions[i], keyval)); hashkey ^= hkey; } i++;//下一個鍵 } //切換上下文 MemoryContextSwitchTo(oldContext); //返回Hash鍵值 *hashvalue = hashkey; return true;//成功獲取 }
ExecPrepHashTableForUnmatched
為ExecScanHashTableForUnmatched函數調用作準備
/* * ExecPrepHashTableForUnmatched * set up for a series of ExecScanHashTableForUnmatched calls * 為ExecScanHashTableForUnmatched函數調用作準備 */ void ExecPrepHashTableForUnmatched(HashJoinState *hjstate) { /*---------- * During this scan we use the HashJoinState fields as follows: * * hj_CurBucketNo: next regular bucket to scan * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums) * hj_CurTuple: last tuple returned, or NULL to start next bucket * 在這次掃描期間,我們使用HashJoinState結構體中的字段如下: * hj_CurBucketNo: 下一個常規的bucket * hj_CurSkewBucketNo: 下一個個傾斜的bucket * hj_CurTuple: 最后返回的元組,或者為NULL(下一個bucket開始) *---------- */ hjstate->hj_CurBucketNo = 0; hjstate->hj_CurSkewBucketNo = 0; hjstate->hj_CurTuple = NULL; }
ExecHashGetBucketAndBatch
確定哈希值的bucket號和批處理號
/* * ExecHashGetBucketAndBatch * Determine the bucket number and batch number for a hash value * ExecHashGetBucketAndBatch * 確定哈希值的bucket號和批處理號 * * Note: on-the-fly increases of nbatch must not change the bucket number * for a given hash code (since we don't move tuples to different hash * chains), and must only cause the batch number to remain the same or * increase. Our algorithm is * bucketno = hashvalue MOD nbuckets * batchno = (hashvalue DIV nbuckets) MOD nbatch * where nbuckets and nbatch are both expected to be powers of 2, so we can * do the computations by shifting and masking. (This assumes that all hash * functions are good about randomizing all their output bits, else we are * likely to have very skewed bucket or batch occupancy.) * 注意:nbatch的動態增加不能更改給定哈希碼的桶號(因為我們不將元組移動到不同的哈希鏈), * 并且只能使批號保持不變或增加。我們的算法是: * bucketno = hashvalue MOD nbuckets * batchno = (hashvalue DIV nbuckets) MOD nbatch * 這里nbucket和nbatch都是2的冪,所以我們可以通過移動和屏蔽來進行計算。 * (這假定所有哈希函數都能很好地隨機化它們的所有輸出位,否則很可能會出現非常傾斜的桶或批處理占用。) * * nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic * bucket count growth. Once we start batching, the value is fixed and does * not change over the course of the join (making it possible to compute batch * number the way we do here). * 當nbatch == 1時,由于動態bucket計數的增長,nbucket和log2_nbucket可能會發生變化。 * 一旦開始批處理,這個值就固定了,并且在連接過程中不會改變(這使得我們可以像這里那樣計算批號)。 * * nbatch is always a power of 2; we increase it only by doubling it. This * effectively adds one more bit to the top of the batchno. * nbatch總是2的冪;我們只是通過x2來調整。這相當于為批號的頭部增加了一位。 */ void ExecHashGetBucketAndBatch(HashJoinTable hashtable, uint32 hashvalue, int *bucketno, int *batchno) { uint32 nbuckets = (uint32) hashtable->nbuckets;//桶數 uint32 nbatch = (uint32) hashtable->nbatch;//批次號 if (nbatch > 1)//批次>1 { /* we can do MOD by masking, DIV by shifting */ //我們可以通過屏蔽來實現MOD,通過移動來實現DIV *bucketno = hashvalue & (nbuckets - 1);//nbuckets - 1后相當于N個1 *batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1); } else { *bucketno = hashvalue & (nbuckets - 1);//只有一個批次,簡單處理即可 *batchno = 0; } }
ExecHashGetSkewBucket
返回這個哈希值的傾斜桶的索引,如果哈希值與任何活動的傾斜桶沒有關聯,則返回INVALID_SKEW_BUCKET_NO。
/* * ExecHashGetSkewBucket * * Returns the index of the skew bucket for this hashvalue, * or INVALID_SKEW_BUCKET_NO if the hashvalue is not * associated with any active skew bucket. * 返回這個哈希值的傾斜桶的索引,如果哈希值與任何活動的傾斜桶沒有關聯,則返回INVALID_SKEW_BUCKET_NO。 */ int ExecHashGetSkewBucket(HashJoinTable hashtable, uint32 hashvalue) { int bucket; /* * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in * particular, this happens after the initial batch is done). * 如果不進行傾斜優化(特別是在初始批處理完成之后),則返回INVALID_SKEW_BUCKET_NO。 */ if (!hashtable->skewEnabled) return INVALID_SKEW_BUCKET_NO; /* * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.' * 由于skewBucketLen是2的冪,可以通過AND操作來做一個模。 */ bucket = hashvalue & (hashtable->skewBucketLen - 1); /* * While we have not hit a hole in the hashtable and have not hit the * desired bucket, we have collided with some other hash value, so try the * next bucket location. * 雖然我們沒有在哈希表中找到一個hole,也沒有找到所需的bucket, * 但是與其他一些哈希值發生了沖突,所以嘗試下一個bucket位置。 */ while (hashtable->skewBucket[bucket] != NULL && hashtable->skewBucket[bucket]->hashvalue != hashvalue) bucket = (bucket + 1) & (hashtable->skewBucketLen - 1); /* * Found the desired bucket? * 找到了bucket,返回 */ if (hashtable->skewBucket[bucket] != NULL) return bucket; /* * There must not be any hashtable entry for this hash value. */ //否則返回INVALID_SKEW_BUCKET_NO return INVALID_SKEW_BUCKET_NO; }
ExecHashJoinSaveTuple
在批處理文件中保存元組.每個元組在文件中記錄的是它的散列值,然后是最小化格式的元組。
/* * ExecHashJoinSaveTuple * save a tuple to a batch file. * 在批處理文件中保存元組 * * The data recorded in the file for each tuple is its hash value, * then the tuple in MinimalTuple format. * 每個元組在文件中記錄的是它的散列值,然后是最小化格式的元組。 * * Note: it is important always to call this in the regular executor * context, not in a shorter-lived context; else the temp file buffers * will get messed up. * 注意:在常規執行程序上下文中調用它總是很重要的,而不是在較短的生命周期中調用它; * 否則臨時文件緩沖區就會出現混亂。 */ void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, BufFile **fileptr) { BufFile *file = *fileptr;//文件指針 size_t written;//寫入大小 if (file == NULL) { /* First write to this batch file, so open it. */ //文件指針為NULL,首次寫入,則打開批處理文件 file = BufFileCreateTemp(false); *fileptr = file; } //首先寫入hash值,返回寫入的大小 written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32)); if (written != sizeof(uint32))//寫入有誤,報錯 ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to hash-join temporary file: %m"))); //寫入tuple written = BufFileWrite(file, (void *) tuple, tuple->t_len); if (written != tuple->t_len)//寫入有誤,報錯 ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to hash-join temporary file: %m"))); }
ExecFetchSlotMinimalTuple
以最小化物理元組的格式提取slot的數據
/* -------------------------------- * ExecFetchSlotMinimalTuple * Fetch the slot's minimal physical tuple. * 以最小化物理元組的格式提取slot的數據. * * If the given tuple table slot can hold a minimal tuple, indicated by a * non-NULL get_minimal_tuple callback, the function returns the minimal * tuple returned by that callback. It assumes that the minimal tuple * returned by the callback is "owned" by the slot i.e. the slot is * responsible for freeing the memory consumed by the tuple. Hence it sets * *shouldFree to false, indicating that the caller should not free the * memory consumed by the minimal tuple. In this case the returned minimal * tuple should be considered as read-only. * 如果給定的元組table slot可以保存由non-NULL get_minimal_tuple回調函數指示的最小元組, * 則函數將返回該回調函數返回的最小元組。 * 它假定回調函數返回的最小元組由slot“擁有”,即slot負責釋放元組所消耗的內存。 * 因此,它將*shouldFree設置為false,表示調用方不應該釋放內存。 * 在這種情況下,返回的最小元組應該被認為是只讀的。 * * If that callback is not supported, it calls copy_minimal_tuple callback * which is expected to return a copy of minimal tuple represnting the * contents of the slot. In this case *shouldFree is set to true, * indicating the caller that it should free the memory consumed by the * minimal tuple. In this case the returned minimal tuple may be written * up. * 如果不支持該回調函數,則調用copy_minimal_tuple回調函數, * 該回調將返回一個表示slot內容的最小元組副本。 * *shouldFree被設置為true,這表示調用者應該釋放內存。 * 在這種情況下,可以寫入返回的最小元組。 * -------------------------------- */ MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot, bool *shouldFree) { /* * sanity checks * 安全檢查 */ Assert(slot != NULL); Assert(!TTS_EMPTY(slot)); if (slot->tts_ops->get_minimal_tuple)//調用slot->tts_ops->get_minimal_tuple { //調用成功,則該元組為只讀,由slot負責釋放 if (shouldFree) *shouldFree = false; return slot->tts_ops->get_minimal_tuple(slot); } else { //調用不成功,設置為true,由調用方釋放 if (shouldFree) *shouldFree = true; return slot->tts_ops->copy_minimal_tuple(slot);//調用copy_minimal_tuple函數 } }
測試腳本如下
testdb=# set enable_nestloop=false; SET testdb=# set enable_mergejoin=false; SET testdb=# explain verbose select dw.*,grjf.grbh,grjf.xm,grjf.ny,grjf.je testdb-# from t_dwxx dw,lateral (select gr.grbh,gr.xm,jf.ny,jf.je testdb(# from t_grxx gr inner join t_jfxx jf testdb(# on gr.dwbh = dw.dwbh testdb(# and gr.grbh = jf.grbh) grjf testdb-# order by dw.dwbh; QUERY PLAN ----------------------------------------------------------------------------------------------- Sort (cost=14828.83..15078.46 rows=99850 width=47) Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je Sort Key: dw.dwbh -> Hash Join (cost=3176.00..6537.55 rows=99850 width=47) Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm, jf.ny, jf.je Hash Cond: ((gr.grbh)::text = (jf.grbh)::text) -> Hash Join (cost=289.00..2277.61 rows=99850 width=32) Output: dw.dwmc, dw.dwbh, dw.dwdz, gr.grbh, gr.xm Inner Unique: true Hash Cond: ((gr.dwbh)::text = (dw.dwbh)::text) -> Seq Scan on public.t_grxx gr (cost=0.00..1726.00 rows=100000 width=16) Output: gr.dwbh, gr.grbh, gr.xm, gr.xb, gr.nl -> Hash (cost=164.00..164.00 rows=10000 width=20) Output: dw.dwmc, dw.dwbh, dw.dwdz -> Seq Scan on public.t_dwxx dw (cost=0.00..164.00 rows=10000 width=20) Output: dw.dwmc, dw.dwbh, dw.dwdz -> Hash (cost=1637.00..1637.00 rows=100000 width=20) Output: jf.ny, jf.je, jf.grbh -> Seq Scan on public.t_jfxx jf (cost=0.00..1637.00 rows=100000 width=20) Output: jf.ny, jf.je, jf.grbh (20 rows)
啟動gdb,設置斷點
(gdb) b ExecHashJoinOuterGetTuple Breakpoint 1 at 0x702edc: file nodeHashjoin.c, line 807. (gdb) b ExecHashGetHashValue Breakpoint 2 at 0x6ff060: file nodeHash.c, line 1778. (gdb) b ExecHashGetBucketAndBatch Breakpoint 3 at 0x6ff1df: file nodeHash.c, line 1880. (gdb) b ExecHashJoinSaveTuple Breakpoint 4 at 0x703973: file nodeHashjoin.c, line 1214. (gdb)
ExecHashGetHashValue
ExecHashGetHashValue->進入函數ExecHashGetHashValue
(gdb) c Continuing. Breakpoint 2, ExecHashGetHashValue (hashtable=0x14acde8, econtext=0x149c3d0, hashkeys=0x14a8e40, outer_tuple=false, keep_nulls=false, hashvalue=0x7ffc7eba5c20) at nodeHash.c:1778 1778 uint32 hashkey = 0;
ExecHashGetHashValue->初始化,切換內存上下文
1778 uint32 hashkey = 0; (gdb) n 1781 int i = 0; (gdb) 1788 ResetExprContext(econtext); (gdb) 1790 oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); (gdb) 1792 if (outer_tuple)
ExecHashGetHashValue->inner hash函數
1792 if (outer_tuple) (gdb) 1795 hashfunctions = hashtable->inner_hashfunctions;
ExecHashGetHashValue->獲取hahs鍵信息
1號RTE(varnoold = 1,即t_dwxx)的dwbh字段(varattno = 2)
(gdb) 1797 foreach(hk, hashkeys) (gdb) 1799 ExprState *keyexpr = (ExprState *) lfirst(hk); (gdb) 1804 hashkey = (hashkey << 1) | ((hashkey & 0x80000000) ? 1 : 0); (gdb) p *keyexpr $1 = {tag = {type = T_ExprState}, flags = 2 '\002', resnull = false, resvalue = 0, resultslot = 0x0, steps = 0x14a8a00, evalfunc = 0x6d1a6e <ExecInterpExprStillValid>, expr = 0x1498fc0, evalfunc_private = 0x6d1e97 <ExecJustInnerVar>, steps_len = 3, steps_alloc = 16, parent = 0x149b738, ext_params = 0x0, innermost_caseval = 0x0, innermost_casenull = 0x0, innermost_domainval = 0x0, innermost_domainnull = 0x0} (gdb) p *(RelabelType *)keyexpr->expr $3 = {xpr = {type = T_RelabelType}, arg = 0x1499018, resulttype = 25, resulttypmod = -1, resultcollid = 100, relabelformat = COERCE_IMPLICIT_CAST, location = -1} (gdb) p *((RelabelType *)keyexpr->expr)->arg $4 = {type = T_Var} (gdb) p *(Var *)((RelabelType *)keyexpr->expr)->arg $5 = {xpr = {type = T_Var}, varno = 65000, varattno = 2, vartype = 1043, vartypmod = 24, varcollid = 100, varlevelsup = 0, varnoold = 1, varoattno = 2, location = 218} (gdb)
ExecHashGetHashValue->獲取hash值,解析表達式
(gdb) n 1809 keyval = ExecEvalExpr(keyexpr, econtext, &isNull); (gdb) 1824 if (isNull) (gdb) p hashkey $6 = 0 (gdb) p keyval $7 = 140460362257270 (gdb)
ExecHashGetHashValue->返回值不為NULL
(gdb) p isNull $8 = false (gdb) n 1838 hkey = DatumGetUInt32(FunctionCall1(&hashfunctions[i], keyval));
ExecHashGetHashValue->計算Hash值
(gdb) n 1839 hashkey ^= hkey; (gdb) p hkey $9 = 3663833849 (gdb) p hashkey $10 = 0 (gdb) n 1842 i++; (gdb) p hashkey $11 = 3663833849 (gdb)
ExecHashGetHashValue->返回計算結果
(gdb) n 1797 foreach(hk, hashkeys) (gdb) 1845 MemoryContextSwitchTo(oldContext); (gdb) 1847 *hashvalue = hashkey; (gdb) 1848 return true; (gdb) 1849 }
ExecHashGetBucketAndBatch
ExecHashGetBucketAndBatch->進入ExecHashGetBucketAndBatch
(gdb) c Continuing. Breakpoint 3, ExecHashGetBucketAndBatch (hashtable=0x14acde8, hashvalue=3663833849, bucketno=0x7ffc7eba5bdc, batchno=0x7ffc7eba5bd8) at nodeHash.c:1880 1880 uint32 nbuckets = (uint32) hashtable->nbuckets;
ExecHashGetBucketAndBatch->獲取bucket數和批次數
1880 uint32 nbuckets = (uint32) hashtable->nbuckets; (gdb) n 1881 uint32 nbatch = (uint32) hashtable->nbatch; (gdb) 1883 if (nbatch > 1) (gdb) p nbuckets $12 = 16384 (gdb) p nbatch $13 = 1 (gdb)
ExecHashGetBucketAndBatch->計算桶號和批次號(只有一個批次,設置為0)
(gdb) n 1891 *bucketno = hashvalue & (nbuckets - 1); (gdb) 1892 *batchno = 0; (gdb) 1894 } (gdb) p bucketno $14 = (int *) 0x7ffc7eba5bdc (gdb) p *bucketno $15 = 11001 (gdb)
ExecHashJoinOuterGetTuple
ExecHashJoinOuterGetTuple->進入ExecHashJoinOuterGetTuple函數
(gdb) info break Num Type Disp Enb Address What 1 breakpoint keep y 0x0000000000702edc in ExecHashJoinOuterGetTuple at nodeHashjoin.c:807 2 breakpoint keep y 0x00000000006ff060 in ExecHashGetHashValue at nodeHash.c:1778 breakpoint already hit 4 times 3 breakpoint keep y 0x00000000006ff1df in ExecHashGetBucketAndBatch at nodeHash.c:1880 breakpoint already hit 4 times 4 breakpoint keep y 0x0000000000703973 in ExecHashJoinSaveTuple at nodeHashjoin.c:1214 (gdb) del 2 (gdb) del 3 (gdb) c Continuing. Breakpoint 1, ExecHashJoinOuterGetTuple (outerNode=0x149ba10, hjstate=0x149b738, hashvalue=0x7ffc7eba5ccc) at nodeHashjoin.c:807 807 HashJoinTable hashtable = hjstate->hj_HashTable; (gdb)
ExecHashJoinOuterGetTuple->查看輸入參數
outerNode:outer relation為順序掃描得到的relation(對t_jfxx進行順序掃描)
hjstate:Hash Join執行狀態
hashvalue:Hash值
(gdb) p *outerNode $16 = {type = T_SeqScanState, plan = 0x1494d10, state = 0x149b0f8, ExecProcNode = 0x71578d <ExecSeqScan>, ExecProcNodeReal = 0x71578d <ExecSeqScan>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, lefttree = 0x0, righttree = 0x0, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, ps_ResultTupleSlot = 0x149c178, ps_ExprContext = 0x149bb28, ps_ProjInfo = 0x0, scandesc = 0x7fbfa69a8308} (gdb) p *hjstate $17 = {js = {ps = {type = T_HashJoinState, plan = 0x1496d18, state = 0x149b0f8, ExecProcNode = 0x70291d <ExecHashJoin>, ExecProcNodeReal = 0x70291d <ExecHashJoin>, instrument = 0x0, worker_instrument = 0x0, worker_jit_instrument = 0x0, qual = 0x0, lefttree = 0x149ba10, righttree = 0x149c2b8, initPlan = 0x0, subPlan = 0x0, chgParam = 0x0, ps_ResultTupleSlot = 0x14a7498, ps_ExprContext = 0x149b950, ps_ProjInfo = 0x149cef0, scandesc = 0x0}, jointype = JOIN_INNER, single_match = true, joinqual = 0x0}, hashclauses = 0x14a7b30, hj_OuterHashKeys = 0x14a8930, hj_InnerHashKeys = 0x14a8e40, hj_HashOperators = 0x14a8ea0, hj_HashTable = 0x14acde8, hj_CurHashValue = 0, hj_CurBucketNo = 0, hj_CurSkewBucketNo = -1, hj_CurTuple = 0x0, hj_OuterTupleSlot = 0x14a79f0, hj_HashTupleSlot = 0x149cc18, hj_NullOuterTupleSlot = 0x0, hj_NullInnerTupleSlot = 0x0, hj_FirstOuterTupleSlot = 0x149bbe8, hj_JoinState = 2, hj_MatchedOuter = false, hj_OuterNotEmpty = false} (gdb) p *hashvalue $18 = 32703 (gdb)
ExecHashJoinOuterGetTuple->只有一個批次,批次號為0
(gdb) n 808 int curbatch = hashtable->curbatch; (gdb) 811 if (curbatch == 0) /* if it is the first pass */ (gdb) p curbatch $20 = 0
ExecHashJoinOuterGetTuple->獲取首個outer tuple slot(不為NULL),重置hjstate->hj_FirstOuterTupleSlot為NULL
(gdb) n 817 slot = hjstate->hj_FirstOuterTupleSlot; (gdb) 818 if (!TupIsNull(slot)) (gdb) p *slot $21 = {type = T_TupleTableSlot, tts_isempty = false, tts_shouldFree = false, tts_shouldFreeMin = false, tts_slow = false, tts_tuple = 0x14ac200, tts_tupleDescriptor = 0x7fbfa69a8308, tts_mcxt = 0x149afe0, tts_buffer = 345, tts_nvalid = 0, tts_values = 0x149bc48, tts_isnull = 0x149bc70, tts_mintuple = 0x0, tts_minhdr = {t_len = 0, t_self = {ip_blkid = { bi_hi = 0, bi_lo = 0}, ip_posid = 0}, t_tableOid = 0, t_data = 0x0}, tts_off = 0, tts_fixedTupleDescriptor = true} (gdb) (gdb) n 819 hjstate->hj_FirstOuterTupleSlot = NULL; (gdb)
ExecHashJoinOuterGetTuple->循環獲取,找到匹配的slot
(gdb) 823 while (!TupIsNull(slot)) (gdb) n 828 ExprContext *econtext = hjstate->js.ps.ps_ExprContext; (gdb)
ExecHashJoinOuterGetTuple->成功匹配,返回slot
(gdb) n 830 econtext->ecxt_outertuple = slot; (gdb) 834 HJ_FILL_OUTER(hjstate), (gdb) 831 if (ExecHashGetHashValue(hashtable, econtext, (gdb) 838 hjstate->hj_OuterNotEmpty = true; (gdb) 840 return slot; (gdb) p *slot $22 = {type = T_TupleTableSlot, tts_isempty = false, tts_shouldFree = false, tts_shouldFreeMin = false, tts_slow = true, tts_tuple = 0x14ac200, tts_tupleDescriptor = 0x7fbfa69a8308, tts_mcxt = 0x149afe0, tts_buffer = 345, tts_nvalid = 1, tts_values = 0x149bc48, tts_isnull = 0x149bc70, tts_mintuple = 0x0, tts_minhdr = {t_len = 0, t_self = {ip_blkid = { bi_hi = 0, bi_lo = 0}, ip_posid = 0}, t_tableOid = 0, t_data = 0x0}, tts_off = 2, tts_fixedTupleDescriptor = true} (gdb)
到此,關于“PostgreSQL中ExecHashJoin依賴其他函數的實現邏輯分析”的學習就結束了,希望能夠解決大家的疑惑。理論與實踐的搭配能更好的幫助大家學習,快去試試吧!若想繼續學習更多相關知識,請繼續關注億速云網站,小編會繼續努力為大家帶來更多實用的文章!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。