@@ -5880,23 +5880,245 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 }
 
 /*
- * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
- *
- * Overwriting violates both MVCC and transactional safety, so the uses
- * of this function in Postgres are extremely limited.  Nonetheless we
- * find some places to use it.
- *
- * The tuple cannot change size, and therefore it's reasonable to assume
- * that its null bitmap (if any) doesn't change either.  So we just
- * overwrite the data portion of the tuple without touching the null
- * bitmap or any of the header fields.
+ * heap_inplace_lock - protect inplace update from concurrent heap_update()
+ *
+ * Evaluate whether the tuple's state is compatible with a no-key update.
+ * Current transaction rowmarks are fine, as is KEY SHARE from any
+ * transaction.  If compatible, return true with the buffer exclusive-locked,
+ * and the caller must release that by calling
+ * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
+ * an error.  Otherwise, return false after blocking transactions, if any,
+ * have ended.
+ *
+ * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
+ * DDL, this doesn't guarantee any particular predicate locking.
+ *
+ * One could modify this to return true for tuples with delete in progress.
+ * All inplace updaters take a lock that conflicts with DROP.  If explicit
+ * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an
+ * update.
+ *
+ * Readers of inplace-updated fields expect changes to those fields to be
+ * durable.  For example, vac_truncate_clog() reads datfrozenxid from
+ * pg_database tuples via catalog snapshots.  A future snapshot must not
+ * return a lower datfrozenxid for the same database OID (lower in the
+ * FullTransactionIdPrecedes() sense).  We achieve that since no update of a
+ * tuple can start while we hold a lock on its buffer.  In cases like
+ * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
+ * to this transaction.  ROLLBACK then is one case where it's okay to lose
+ * inplace updates.  (Restoring relhasindex=false on ROLLBACK is fine, since
+ * any concurrent CREATE INDEX would have blocked, then inplace-updated the
+ * committed tuple.)
+ *
+ * In principle, we could avoid waiting by overwriting every tuple in the
+ * updated tuple chain.  Reader expectations permit updating a tuple only if
+ * it's aborted, is the tail of the chain, or we already updated the tuple
+ * referenced in its t_ctid.  Hence, we would need to overwrite the tuples in
+ * order from tail to head.  That would imply either (a) mutating all tuples
+ * in one critical section or (b) accepting a chance of partial completion.
+ * Partial completion of a relfrozenxid update would have the weird
+ * consequence that the table's next VACUUM could see the table's relfrozenxid
+ * move forward between vacuum_get_cutoffs() and finishing.
+ */
+bool
+heap_inplace_lock(Relation relation,
+                  HeapTuple oldtup_ptr, Buffer buffer)
+{
+    HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
+    TM_Result   result;
+    bool        ret;
+
+    Assert(BufferIsValid(buffer));
+
+    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+    /*----------
+     * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
+     *
+     * - wait unconditionally
+     * - no tuple locks
+     * - don't recheck header after wait: simpler to defer to next iteration
+     * - don't try to continue even if the updater aborts: likewise
+     * - no crosscheck
+     */
+    result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
+                                      buffer);
+
+    if (result == TM_Invisible)
+    {
+        /* no known way this can happen */
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg_internal("attempted to overwrite invisible tuple")));
+    }
+    else if (result == TM_SelfModified)
+    {
+        /*
+         * CREATE INDEX might reach this if an expression is silly enough to
+         * call e.g. SELECT ... FROM pg_class FOR SHARE.  C code of other SQL
+         * statements might get here after a heap_update() of the same row, in
+         * the absence of an intervening CommandCounterIncrement().
+         */
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
+    }
+    else if (result == TM_BeingModified)
+    {
+        TransactionId xwait;
+        uint16      infomask;
+
+        xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+        infomask = oldtup.t_data->t_infomask;
+
+        if (infomask & HEAP_XMAX_IS_MULTI)
+        {
+            LockTupleMode lockmode = LockTupleNoKeyExclusive;
+            MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
+            int         remain;
+            bool        current_is_member;
+
+            if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+                                        lockmode, &current_is_member))
+            {
+                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                ret = false;
+                MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
+                                relation, &oldtup.t_self, XLTW_Update,
+                                &remain);
+            }
+            else
+                ret = true;
+        }
+        else if (TransactionIdIsCurrentTransactionId(xwait))
+            ret = true;
+        else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+            ret = true;
+        else
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+            ret = false;
+            XactLockTableWait(xwait, relation, &oldtup.t_self,
+                              XLTW_Update);
+        }
+    }
+    else
+    {
+        ret = (result == TM_Ok);
+        if (!ret)
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+        }
+    }
+
+    /*
+     * GetCatalogSnapshot() relies on invalidation messages to know when to
+     * take a new snapshot.  COMMIT of xwait is responsible for sending the
+     * invalidation.  We're not acquiring heavyweight locks sufficient to
+     * block if not yet sent, so we must take a new snapshot to ensure a later
+     * attempt has a fair chance.  While we don't need this if xwait aborted,
+     * don't bother optimizing that.
+     */
+    if (!ret)
+        InvalidateCatalogSnapshot();
+    return ret;
+}
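
For illustration, a minimal caller-side sketch of the retry protocol that the
header comment of heap_inplace_lock() describes.  This is not part of the
patch; fetch_catalog_tuple() is a hypothetical stand-in for the systable scan
refetch that systable_inplace_update_begin() performs.

    static void
    overwrite_tuple_inplace(Relation rel, HeapTuple newtup)
    {
        for (;;)
        {
            Buffer      buffer;
            HeapTuple   oldtup;

            /* refetch under a current catalog snapshot (hypothetical helper) */
            oldtup = fetch_catalog_tuple(rel, &buffer);

            /* true: buffer is exclusive-locked; we may overwrite oldtup */
            if (heap_inplace_lock(rel, oldtup, buffer))
            {
                /* newtup must match oldtup's length and header layout */
                heap_inplace_update_and_unlock(rel, oldtup, newtup, buffer);
                ReleaseBuffer(buffer);  /* lock already released; drop the pin */
                return;
            }

            /*
             * false: the blocker has ended and the catalog snapshot was
             * invalidated, so drop the pin and retry with a fresh fetch.
             */
            ReleaseBuffer(buffer);
        }
    }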
+
+/*
+ * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
  *
- * tuple is an in-memory tuple structure containing the data to be written
- * over the target tuple.  Also, tuple->t_self identifies the target tuple.
+ * The tuple cannot change size, and therefore its header fields and null
+ * bitmap (if any) don't change either.
+ */
+void
+heap_inplace_update_and_unlock(Relation relation,
+                               HeapTuple oldtup, HeapTuple tuple,
+                               Buffer buffer)
+{
+    HeapTupleHeader htup = oldtup->t_data;
+    uint32      oldlen;
+    uint32      newlen;
+
+    Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
+    oldlen = oldtup->t_len - htup->t_hoff;
+    newlen = tuple->t_len - tuple->t_data->t_hoff;
+    if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
+        elog(ERROR, "wrong tuple length");
+
+    /* NO EREPORT(ERROR) from here till changes are logged */
+    START_CRIT_SECTION();
+
+    memcpy((char *) htup + htup->t_hoff,
+           (char *) tuple->t_data + tuple->t_data->t_hoff,
+           newlen);
+
+    /*----------
+     * XXX A crash here can allow datfrozenxid to get ahead of relfrozenxid:
+     *
+     * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
+     * ["R" is a VACUUM tbl]
+     * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
+     * D: systable_getnext() returns pg_class tuple of tbl
+     * R: memcpy() into pg_class tuple of tbl
+     * D: raise pg_database.datfrozenxid, XLogInsert(), finish
+     * [crash]
+     * [recovery restores datfrozenxid w/o relfrozenxid]
+     */
+
+    MarkBufferDirty(buffer);
+
+    /* XLOG stuff */
+    if (RelationNeedsWAL(relation))
+    {
+        xl_heap_inplace xlrec;
+        XLogRecPtr  recptr;
+
+        xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
+
+        XLogBeginInsert();
+        XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
+
+        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+        XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+
+        /* inplace updates aren't decoded atm, don't log the origin */
+
+        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
+
+        PageSetLSN(BufferGetPage(buffer), recptr);
+    }
+
+    END_CRIT_SECTION();
+
+    heap_inplace_unlock(relation, oldtup, buffer);
+
+    /*
+     * Send out shared cache inval if necessary.  Note that because we only
+     * pass the new version of the tuple, this mustn't be used for any
+     * operations that could change catcache lookup keys.  But we aren't
+     * bothering with index updates either, so that's true a fortiori.
+     *
+     * XXX ROLLBACK discards the invalidation.  See test inplace-inval.spec.
+     */
+    if (!IsBootstrapProcessingMode())
+        CacheInvalidateHeapTuple(relation, tuple, NULL);
+}
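
Because the replacement must match the old tuple's length and header layout, a
caller typically copies the existing tuple and mutates a fixed-width field in
place.  A sketch under that assumption (again not part of the patch;
relation/oldtup/buffer as obtained via a successful heap_inplace_lock()):

    HeapTuple   newtup = heap_copytuple(oldtup);    /* same t_len and t_hoff */

    /* flip a fixed-width pg_class field; the tuple size is unchanged */
    ((Form_pg_class) GETSTRUCT(newtup))->relhasindex = true;

    heap_inplace_update_and_unlock(relation, oldtup, newtup, buffer);
    heap_freetuple(newtup);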
+
+/*
+ * heap_inplace_unlock - reverse of heap_inplace_lock
+ */
+void
+heap_inplace_unlock(Relation relation,
+                    HeapTuple oldtup, Buffer buffer)
+{
+    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+}
+
+/*
+ * heap_inplace_update - deprecated
  *
- * Note that the tuple updated here had better not come directly from the
- * syscache if the relation has a toast relation as this tuple could
- * include toast values that have been expanded, causing a failure here.
+ * This exists only to keep modules working in back branches.  Affected
+ * modules should migrate to systable_inplace_update_begin().
  */
 void
 heap_inplace_update(Relation relation, HeapTuple tuple)