@@ -5800,23 +5800,245 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 }

 /*
- * heap_inplace_update - update a tuple "in place" (ie, overwrite it)
+ * heap_inplace_lock - protect inplace update from concurrent heap_update()
  *
- * Overwriting violates both MVCC and transactional safety, so the uses
- * of this function in Postgres are extremely limited.  Nonetheless we
- * find some places to use it.
+ * Evaluate whether the tuple's state is compatible with a no-key update.
+ * Current transaction rowmarks are fine, as is KEY SHARE from any
+ * transaction.  If compatible, return true with the buffer exclusive-locked,
+ * and the caller must release that by calling
+ * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
+ * an error.  Otherwise, return false after blocking transactions, if any,
+ * have ended.
  *
- * The tuple cannot change size, and therefore it's reasonable to assume
- * that its null bitmap (if any) doesn't change either.  So we just
- * overwrite the data portion of the tuple without touching the null
- * bitmap or any of the header fields.
+ * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
+ * DDL, this doesn't guarantee any particular predicate locking.
  *
- * tuple is an in-memory tuple structure containing the data to be written
- * over the target tuple.  Also, tuple->t_self identifies the target tuple.
+ * One could modify this to return true for tuples with delete in progress.
+ * All inplace updaters take a lock that conflicts with DROP.  If explicit
+ * "DELETE FROM pg_class" is in progress, we'll wait for it like we would an
+ * update.
  *
- * Note that the tuple updated here had better not come directly from the
- * syscache if the relation has a toast relation as this tuple could
- * include toast values that have been expanded, causing a failure here.
+ * Readers of inplace-updated fields expect changes to those fields are
+ * durable.  For example, vac_truncate_clog() reads datfrozenxid from
+ * pg_database tuples via catalog snapshots.  A future snapshot must not
+ * return a lower datfrozenxid for the same database OID (lower in the
+ * FullTransactionIdPrecedes() sense).  We achieve that since no update of a
+ * tuple can start while we hold a lock on its buffer.  In cases like
+ * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
+ * to this transaction.  ROLLBACK then is one case where it's okay to lose
+ * inplace updates.  (Restoring relhasindex=false on ROLLBACK is fine, since
+ * any concurrent CREATE INDEX would have blocked, then inplace-updated the
+ * committed tuple.)
+ *
+ * In principle, we could avoid waiting by overwriting every tuple in the
+ * updated tuple chain.  Reader expectations permit updating a tuple only if
+ * it's aborted, is the tail of the chain, or we already updated the tuple
+ * referenced in its t_ctid.  Hence, we would need to overwrite the tuples in
+ * order from tail to head.  That would imply either (a) mutating all tuples
+ * in one critical section or (b) accepting a chance of partial completion.
+ * Partial completion of a relfrozenxid update would have the weird
+ * consequence that the table's next VACUUM could see the table's relfrozenxid
+ * move forward between vacuum_get_cutoffs() and finishing.
+ */
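+/*
+ * Illustrative sketch only (not part of the patch): the expected caller,
+ * such as systable_inplace_update_begin(), retries in a loop along these
+ * lines, re-reading the tuple whenever this function returns false:
+ *
+ *    for (;;)
+ *    {
+ *        ... fetch the tuple into oldtup and pin its buffer ...
+ *        if (heap_inplace_lock(relation, &oldtup, buffer))
+ *            break;              locked; mutate a copy, then call
+ *                                heap_inplace_update_and_unlock()
+ *        ReleaseBuffer(buffer);  a blocker ended; retry from scratch
+ *    }
+ */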
+bool
+heap_inplace_lock(Relation relation,
+                  HeapTuple oldtup_ptr, Buffer buffer)
+{
+    HeapTupleData oldtup = *oldtup_ptr;    /* minimize diff vs. heap_update() */
+    TM_Result   result;
+    bool        ret;
+
+    Assert(BufferIsValid(buffer));
+
+    LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
+    /*----------
+     * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
+     *
+     * - wait unconditionally
+     * - no tuple locks
+     * - don't recheck header after wait: simpler to defer to next iteration
+     * - don't try to continue even if the updater aborts: likewise
+     * - no crosscheck
+     */
+    result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
+                                      buffer);
+
+    if (result == TM_Invisible)
+    {
+        /* no known way this can happen */
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg_internal("attempted to overwrite invisible tuple")));
+    }
+    else if (result == TM_SelfModified)
+    {
+        /*
+         * CREATE INDEX might reach this if an expression is silly enough to
+         * call e.g. SELECT ... FROM pg_class FOR SHARE.  C code of other SQL
+         * statements might get here after a heap_update() of the same row, in
+         * the absence of an intervening CommandCounterIncrement().
+         */
+        ereport(ERROR,
+                (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
+    }
+    else if (result == TM_BeingModified)
+    {
+        TransactionId xwait;
+        uint16      infomask;
+
+        xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
+        infomask = oldtup.t_data->t_infomask;
+
+        if (infomask & HEAP_XMAX_IS_MULTI)
+        {
+            LockTupleMode lockmode = LockTupleNoKeyExclusive;
+            MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
+            int         remain;
+            bool        current_is_member;
+
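+            /*
+             * Check the multixact for conflicts as though performing a
+             * no-key update; per the header comment, KEY SHARE lockers from
+             * other transactions don't conflict.
+             */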
+            if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
+                                        lockmode, &current_is_member))
+            {
+                LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+                ret = false;
+                MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
+                                relation, &oldtup.t_self, XLTW_Update,
+                                &remain);
+            }
+            else
+                ret = true;
+        }
+        else if (TransactionIdIsCurrentTransactionId(xwait))
+            ret = true;
+        else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
+            ret = true;
+        else
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+            ret = false;
+            XactLockTableWait(xwait, relation, &oldtup.t_self,
+                              XLTW_Update);
+        }
+    }
+    else
+    {
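+        /*
+         * Remaining results: TM_Ok means no conflict; TM_Updated or
+         * TM_Deleted means a concurrent updater already committed, so
+         * report failure without waiting.
+         */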
+        ret = (result == TM_Ok);
+        if (!ret)
+        {
+            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+        }
+    }
+
+    /*
+     * GetCatalogSnapshot() relies on invalidation messages to know when to
+     * take a new snapshot.  COMMIT of xwait is responsible for sending the
+     * invalidation.  We're not acquiring heavyweight locks sufficient to
+     * block if not yet sent, so we must take a new snapshot to ensure a later
+     * attempt has a fair chance.  While we don't need this if xwait aborted,
+     * don't bother optimizing that.
+     */
+    if (!ret)
+        InvalidateCatalogSnapshot();
+    return ret;
+}
+
+/*
+ * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
+ *
+ * The tuple cannot change size, and therefore its header fields and null
+ * bitmap (if any) don't change either.
+ */
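+/*
+ * For illustration (a sketch, not taken from the patch): after
+ * heap_inplace_lock() succeeds on a pg_class tuple, a caller might set
+ *
+ *    ((Form_pg_class) GETSTRUCT(tuple))->relhasindex = true;
+ *
+ * on its local copy and pass that copy here; only the data portion, which
+ * must be of identical size, is overwritten.
+ */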
+void
+heap_inplace_update_and_unlock(Relation relation,
+                               HeapTuple oldtup, HeapTuple tuple,
+                               Buffer buffer)
+{
+    HeapTupleHeader htup = oldtup->t_data;
+    uint32      oldlen;
+    uint32      newlen;
+
+    Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
+    oldlen = oldtup->t_len - htup->t_hoff;
+    newlen = tuple->t_len - tuple->t_data->t_hoff;
+    if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
+        elog(ERROR, "wrong tuple length");
+
+    /* NO EREPORT(ERROR) from here till changes are logged */
+    START_CRIT_SECTION();
+
+    memcpy((char *) htup + htup->t_hoff,
+           (char *) tuple->t_data + tuple->t_data->t_hoff,
+           newlen);
+
+    /*----------
+     * XXX A crash here can allow datfrozenxid to get ahead of relfrozenxid:
+     *
+     * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
+     * ["R" is a VACUUM tbl]
+     * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
+     * D: systable_getnext() returns pg_class tuple of tbl
+     * R: memcpy() into pg_class tuple of tbl
+     * D: raise pg_database.datfrozenxid, XLogInsert(), finish
+     * [crash]
+     * [recovery restores datfrozenxid w/o relfrozenxid]
+     */
+
+    MarkBufferDirty(buffer);
+
+    /* XLOG stuff */
+    if (RelationNeedsWAL(relation))
+    {
+        xl_heap_inplace xlrec;
+        XLogRecPtr  recptr;
+
+        xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
+
+        XLogBeginInsert();
+        XLogRegisterData((char *) &xlrec, SizeOfHeapInplace);
+
+        XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+        XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen);
+
+        /* inplace updates aren't decoded atm, don't log the origin */
+
+        recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
+
+        PageSetLSN(BufferGetPage(buffer), recptr);
+    }
+
+    END_CRIT_SECTION();
+
+    heap_inplace_unlock(relation, oldtup, buffer);
+
+    /*
+     * Send out shared cache inval if necessary.  Note that because we only
+     * pass the new version of the tuple, this mustn't be used for any
+     * operations that could change catcache lookup keys.  But we aren't
+     * bothering with index updates either, so that's true a fortiori.
+     *
+     * XXX ROLLBACK discards the invalidation.  See test inplace-inval.spec.
+     */
+    if (!IsBootstrapProcessingMode())
+        CacheInvalidateHeapTuple(relation, tuple, NULL);
+}
+
+/*
+ * heap_inplace_unlock - reverse of heap_inplace_lock
+ */
+void
+heap_inplace_unlock(Relation relation,
+                    HeapTuple oldtup, Buffer buffer)
+{
+    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+}
+
+/*
+ * heap_inplace_update - deprecated
+ *
+ * This exists only to keep modules working in back branches.  Affected
+ * modules should migrate to systable_inplace_update_begin().
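+ *
+ * A rough migration sketch (placeholder argument names; see genam.h for
+ * the exact interface):
+ *
+ *    void       *state;
+ *    HeapTuple   oldtup;
+ *
+ *    systable_inplace_update_begin(rel, indexId, true, NULL,
+ *                                  nkeys, key, &oldtup, &state);
+ *    ... modify a copy of oldtup ...
+ *    systable_inplace_update_finish(state, newtup);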
  */
 void
 heap_inplace_update(Relation relation, HeapTuple tuple)