88 *
99 *
1010 * IDENTIFICATION
11- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.122 2006/10/04 00:29:58 momjian Exp $
11+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
3535 *descriptors in its own descriptor pool. This is done to make it
3636 *easier to support relations that are larger than the operating
3737 *system's file size limit (often 2GBytes). In order to do that,
38- *we break relations up into chunks of < 2GBytes and store one chunk
39- *in each of several files that represent the relation. See the
40- *BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
41- *All chunks except the last MUST have size exactly equal to RELSEG_SIZE
42- *blocks --- see mdnblocks() and mdtruncate().
38+ *we break relations up into "segment" files that are each shorter than
39+ *the OS file size limit. The segment size is set by the RELSEG_SIZE
40+ *configuration constant in pg_config_manual.h.
41+ *
42+ *On disk, a relation must consist of consecutively numbered segment
43+ *files in the pattern
44+ *-- Zero or more full segments of exactly RELSEG_SIZE blocks each
45+ *-- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
46+ *-- Optionally, any number of inactive segments of size 0 blocks.
47+ *The full and partial segments are collectively the "active" segments.
48+ *Inactive segments are those that once contained data but are currently
49+ *not needed because of an mdtruncate() operation. The reason for leaving
50+ *them present at size zero, rather than unlinking them, is that other
51+ *backends and/or the bgwriter might be holding open file references to
52+ *such segments. If the relation expands again after mdtruncate(), such
53+ *that a deactivated segment becomes active again, it is important that
54+ *such file references still be valid --- else data might get written
55+ *out to an unlinked old copy of a segment file that will eventually
56+ *disappear.
4357 *
4458 *The file descriptor pointer (md_fd field) stored in the SMgrRelation
45- *cache is, therefore, just the head of a list of MdfdVec objects.
46- *But note the md_fd pointer can be NULL, indicating relation not open.
59+ *cache is, therefore, just the head of a list of MdfdVec objects, one
60+ *per segment. But note the md_fd pointer can be NULL, indicating
61+ *relation not open.
4762 *
48- *Note that mdfd_chain == NULL does not necessarily mean the relation
63+ *Also note that mdfd_chain == NULL does not necessarily mean the relation
4964 *doesn't have another segment after this one; we may just not have
5065 *opened the next segment yet. (We could not have "all segments are
5166 *in the chain" as an invariant anyway, since another backend could
52- *extend the relation when we weren't looking.)
67+ *extend the relation when we weren't looking.) We do not make chain
68+ *entries for inactive segments, however; as soon as we find a partial
69+ *segment, we assume that any subsequent segments are inactive.
5370 *
5471 *All MdfdVec objects are palloc'd in the MdCxt memory context.
72+ *
73+ *Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
74+ *for use on machines that support large files. Beware that that
75+ *code has not been tested in a long time and is probably bit-rotted.
5576 */
5677
5778typedef struct _MdfdVec
@@ -77,8 +98,6 @@ static MemoryContext MdCxt;/* context for all md.c allocations */
7798 *
7899 * (Regular backends do not track pending operations locally, but forward
79100 * them to the bgwriter.)
80- *
81- * XXX for WIN32, may want to expand this to track pending deletes, too.
82101 */
83102typedef struct
84103{
@@ -222,12 +241,16 @@ mdunlink(RelFileNode rnode, bool isRedo)
222241}
223242
224243#ifndef LET_OS_MANAGE_FILESIZE
225- /*Get the additional segments, if any */
244+ /*Delete the additional segments, if any */
226245if (status )
227246{
228247char * segpath = (char * )palloc (strlen (path )+ 12 );
229248BlockNumber segno ;
230249
250+ /*
251+ * Note that because we loop until getting ENOENT, we will
252+ * correctly remove all inactive segments as well as active ones.
253+ */
231254for (segno = 1 ;;segno ++ )
232255{
233256sprintf (segpath ,"%s.%u" ,path ,segno );
@@ -257,15 +280,10 @@ mdunlink(RelFileNode rnode, bool isRedo)
257280 *
258281 *The semantics are basically the same as mdwrite(): write at the
259282 *specified position. However, we are expecting to extend the
260- *relation (ie, blocknum is the current EOF), and so in case of
283+ *relation (ie, blocknum is >= the current EOF), and so in case of
261284 *failure we clean up by truncating.
262285 *
263286 *This routine returns true or false, with errno set as appropriate.
264- *
265- * Note: this routine used to call mdnblocks() to get the block position
266- * to write at, but that's pretty silly since the caller needs to know where
267- * the block will be written, and accordingly must have done mdnblocks()
268- * already. Might as well pass in the position and save a seek.
269287 */
270288bool
271289mdextend (SMgrRelation reln ,BlockNumber blocknum ,char * buffer ,bool isTemp )
@@ -498,10 +516,10 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
498516/*
499517 *mdnblocks() -- Get the number of blocks stored in a relation.
500518 *
501- *Important side effect: all segments of the relation are opened
519+ *Important side effect: all active segments of the relation are opened
502520 *and added to the mdfd_chain list. If this routine has not been
503521 *called, then only segments up to the last one actually touched
504- *are present in the chain...
522+ *are present in the chain.
505523 *
506524 *Returns # of blocks, or InvalidBlockNumber on error.
507525 */
@@ -518,9 +536,13 @@ mdnblocks(SMgrRelation reln)
518536 * Skip through any segments that aren't the last one, to avoid redundant
519537 * seeks on them. We have previously verified that these segments are
520538 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
521- * (NOTE: this assumption could only be wrong if another backend has
539+ *
540+ * NOTE: this assumption could only be wrong if another backend has
522541 * truncated the relation. We rely on higher code levels to handle that
523- * scenario by closing and re-opening the md fd.)
542+ * scenario by closing and re-opening the md fd, which is handled via
543+ * relcache flush. (Since the bgwriter doesn't participate in relcache
544+ * flush, it could have segment chain entries for inactive segments;
545+ * that's OK because the bgwriter never needs to compute relation size.)
524546 */
525547while (v -> mdfd_chain != NULL )
526548{
@@ -546,8 +568,8 @@ mdnblocks(SMgrRelation reln)
546568/*
547569 * Because we pass O_CREAT, we will create the next segment (with
548570 * zero length) immediately, if the last segment is of length
549- * REL_SEGSIZE. This is unnecessary but harmless, and testing for
550- * the case would take more cycles than it seems worth.
571+ * RELSEG_SIZE. While perhaps not strictly necessary, this keeps
572+ * the logic simple.
551573 */
552574v -> mdfd_chain = _mdfd_openseg (reln ,segno ,O_CREAT );
553575if (v -> mdfd_chain == NULL )
@@ -577,8 +599,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
577599#endif
578600
579601/*
580- * NOTE: mdnblocks makes sure we have opened all existing segments, so
581- * that truncate/delete loop will get them all!
602+ * NOTE: mdnblocks makes sure we have opened all active segments, so
603+ * that truncation loop will get them all!
582604 */
583605curnblk = mdnblocks (reln );
584606if (curnblk == InvalidBlockNumber )
@@ -599,14 +621,17 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
599621if (priorblocks > nblocks )
600622{
601623/*
602- * This segment is no longer wanted at all (and has already been
603- * unlinked from the mdfd_chain). We truncate the file before
604- * deleting it because if other backends are holding the file
605- * open, the unlink will fail on some platforms. Better a
606- * zero-size file gets left around than a big file...
624+ * This segment is no longer active (and has already been
625+ * unlinked from the mdfd_chain). We truncate the file, but do
626+ * not delete it, for reasons explained in the header comments.
607627 */
608- FileTruncate (v -> mdfd_vfd ,0 );
609- FileUnlink (v -> mdfd_vfd );
628+ if (FileTruncate (v -> mdfd_vfd ,0 )< 0 )
629+ return InvalidBlockNumber ;
630+ if (!isTemp )
631+ {
632+ if (!register_dirty_segment (reln ,v ))
633+ return InvalidBlockNumber ;
634+ }
610635v = v -> mdfd_chain ;
611636Assert (ov != reln -> md_fd );/* we never drop the 1st segment */
612637pfree (ov );
@@ -618,8 +643,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
618643 * the right length, and clear chain link that points to any
619644 * remaining segments (which we shall zap). NOTE: if nblocks is
620645 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
621- * segment to 0 length but keep it. This is mainly so that the
622- * right thing happens if nblocks==0.
646+ * segment to 0 length but keep it. This adheres to the invariant
647+ * given in the header comments.
623648 */
624649BlockNumber lastsegblocks = nblocks - priorblocks ;
625650
@@ -669,7 +694,7 @@ mdimmedsync(SMgrRelation reln)
669694BlockNumber curnblk ;
670695
671696/*
672- * NOTE: mdnblocks makes sure we have opened all existing segments, so
697+ * NOTE: mdnblocks makes sure we have opened all active segments, so
673698 * that fsync loop will get them all!
674699 */
675700curnblk = mdnblocks (reln );