88 *
99 *
1010 * IDENTIFICATION
11- * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.65 2000/04/09 04:43:20 tgl Exp $
11+ * $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.66 2000/04/10 23:41:51 tgl Exp $
1212 *
1313 *-------------------------------------------------------------------------
1414 */
4848typedef struct _MdfdVec
4949{
5050int mdfd_vfd ;/* fd number in vfd pool */
51- int mdfd_flags ;/*free, temporary */
51+ int mdfd_flags ;/*fd status flags */
5252
5353/* these are the assigned bits in mdfd_flags: */
5454#define MDFD_FREE (1 << 0)/* unused entry */
55- #define MDFD_TEMP (1 << 1)/* close this entry at transaction end */
5655
5756int mdfd_lstbcnt ;/* most recent block count */
5857int mdfd_nextFree ;/* next free vector */
@@ -72,8 +71,8 @@ static void mdclose_fd(int fd);
7271static int _mdfd_getrelnfd (Relation reln );
7372static MdfdVec * _mdfd_openseg (Relation reln ,int segno ,int oflags );
7473static MdfdVec * _mdfd_getseg (Relation reln ,int blkno );
75- static MdfdVec * _mdfd_blind_getseg (char * dbname ,char * relname ,
76- Oid dbid ,Oid relid ,int blkno );
74+ static int _mdfd_blind_getseg (char * dbname ,char * relname ,
75+ Oid dbid ,Oid relid ,int blkno );
7776static int _fdvec_alloc (void );
7877static void _fdvec_free (int );
7978static BlockNumber _mdnblocks (File file ,Size blcksz );
@@ -572,23 +571,25 @@ mdflush(Relation reln, BlockNumber blocknum, char *buffer)
572571 *
573572 *We have to be able to do this using only the name and OID of
574573 *the database and relation in which the block belongs. Otherwise
575- *this is just like mdwrite().
574+ *this is much like mdwrite(). If dofsync is TRUE, then we fsync
575+ *the file, making it more like mdflush().
576576 */
577577int
578578mdblindwrt (char * dbname ,
579579char * relname ,
580580Oid dbid ,
581581Oid relid ,
582582BlockNumber blkno ,
583- char * buffer )
583+ char * buffer ,
584+ bool dofsync )
584585{
585586int status ;
586587long seekpos ;
587- MdfdVec * v ;
588+ int fd ;
588589
589- v = _mdfd_blind_getseg (dbname ,relname ,dbid ,relid ,blkno );
590+ fd = _mdfd_blind_getseg (dbname ,relname ,dbid ,relid ,blkno );
590591
591- if (v == NULL )
592+ if (fd < 0 )
592593return SM_FAIL ;
593594
594595#ifndef LET_OS_MANAGE_FILESIZE
@@ -601,11 +602,22 @@ mdblindwrt(char *dbname,
601602seekpos = (long ) (BLCKSZ * (blkno ));
602603#endif
603604
604- if (FileSeek (v -> mdfd_vfd ,seekpos ,SEEK_SET )!= seekpos )
605+ if (lseek (fd ,seekpos ,SEEK_SET )!= seekpos )
606+ {
607+ close (fd );
605608return SM_FAIL ;
609+ }
606610
607611status = SM_SUCCESS ;
608- if (FileWrite (v -> mdfd_vfd ,buffer ,BLCKSZ )!= BLCKSZ )
612+
613+ /* write and optionally sync the block */
614+ if (write (fd ,buffer ,BLCKSZ )!= BLCKSZ )
615+ status = SM_FAIL ;
616+ else if (dofsync &&
617+ pg_fsync (fd )< 0 )
618+ status = SM_FAIL ;
619+
620+ if (close (fd )< 0 )
609621status = SM_FAIL ;
610622
611623return status ;
@@ -633,7 +645,8 @@ mdmarkdirty(Relation reln, BlockNumber blkno)
633645 *
634646 *We have to be able to do this using only the name and OID of
635647 *the database and relation in which the block belongs. Otherwise
636- *this is just like mdmarkdirty().
648+ *this is much like mdmarkdirty(). However, we do the fsync immediately
649+ *rather than building md/fd datastructures to postpone it till later.
637650 */
638651int
639652mdblindmarkdirty (char * dbname ,
@@ -642,16 +655,23 @@ mdblindmarkdirty(char *dbname,
642655Oid relid ,
643656BlockNumber blkno )
644657{
645- MdfdVec * v ;
658+ int status ;
659+ int fd ;
646660
647- v = _mdfd_blind_getseg (dbname ,relname ,dbid ,relid ,blkno );
661+ fd = _mdfd_blind_getseg (dbname ,relname ,dbid ,relid ,blkno );
648662
649- if (v == NULL )
663+ if (fd < 0 )
650664return SM_FAIL ;
651665
652- FileMarkDirty ( v -> mdfd_vfd ) ;
666+ status = SM_SUCCESS ;
653667
654- return SM_SUCCESS ;
668+ if (pg_fsync (fd )< 0 )
669+ status = SM_FAIL ;
670+
671+ if (close (fd )< 0 )
672+ status = SM_FAIL ;
673+
674+ return status ;
655675}
656676
657677/*
@@ -820,24 +840,15 @@ mdcommit()
820840v = & Md_fdvec [i ];
821841if (v -> mdfd_flags & MDFD_FREE )
822842continue ;
823- if (v -> mdfd_flags & MDFD_TEMP )
824- {
825- /* Sync and close the file */
826- mdclose_fd (i );
827- }
828- else
829- {
830- /* Sync, but keep the file entry */
831-
843+ /* Sync the file entry */
832844#ifndef LET_OS_MANAGE_FILESIZE
833- for ( ;v != (MdfdVec * )NULL ;v = v -> mdfd_chain )
845+ for ( ;v != (MdfdVec * )NULL ;v = v -> mdfd_chain )
834846#else
835- if (v != (MdfdVec * )NULL )
847+ if (v != (MdfdVec * )NULL )
836848#endif
837- {
838- if (FileSync (v -> mdfd_vfd )< 0 )
839- return SM_FAIL ;
840- }
849+ {
850+ if (FileSync (v -> mdfd_vfd )< 0 )
851+ return SM_FAIL ;
841852}
842853}
843854
@@ -854,21 +865,9 @@ mdcommit()
854865int
855866mdabort ()
856867{
857- int i ;
858- MdfdVec * v ;
859-
860- for (i = 0 ;i < CurFd ;i ++ )
861- {
862- v = & Md_fdvec [i ];
863- if (v -> mdfd_flags & MDFD_FREE )
864- continue ;
865- if (v -> mdfd_flags & MDFD_TEMP )
866- {
867- /* Close the file */
868- mdclose_fd (i );
869- }
870- }
871-
868+ /* We don't actually have to do anything here. fd.c will discard
869+ * fsync-needed bits in its AtEOXact_Files() routine.
870+ */
872871return SM_SUCCESS ;
873872}
874873
@@ -1057,102 +1056,52 @@ _mdfd_getseg(Relation reln, int blkno)
10571056return v ;
10581057}
10591058
1060- /* Find the segment of the relation holding the specified block.
1061- * This is the same as _mdfd_getseg() except that we must work
1062- * "blind" with no Relation struct.
1059+ /*
1060+ * Find the segment of the relation holding the specified block.
10631061 *
1064- * NOTE: we have no easy way to tell whether a FD already exists for the
1065- * target relation, so we always make a new one. This should probably
1066- * be improved somehow, but I doubt it's a significant performance issue
1067- * under normal circumstances. The FD is marked to be closed at end of xact
1068- * so that we don't accumulate a lot of dead FDs.
1062+ * This performs the same work as _mdfd_getseg() except that we must work
1063+ * "blind" with no Relation struct. We assume that we are not likely to
1064+ * touch the same relation again soon, so we do not create an FD entry for
1065+ * the relation --- we just open a kernel file descriptor which will be
1066+ * used and promptly closed. The return value is the kernel descriptor,
1067+ * or -1 on failure.
10691068 */
10701069
1071- static MdfdVec *
1070+ static int
10721071_mdfd_blind_getseg (char * dbname ,char * relname ,Oid dbid ,Oid relid ,
10731072int blkno )
10741073{
1075- MdfdVec * v ;
10761074char * path ;
10771075int fd ;
1078- int vfd ;
10791076#ifndef LET_OS_MANAGE_FILESIZE
10801077int segno ;
1081- int targsegno ;
10821078#endif
10831079
1084- /* construct the path to the file and open it */
1080+ /* construct the path to the relation */
10851081path = relpath_blind (dbname ,relname ,dbid ,relid );
10861082
1087- #ifndef __CYGWIN32__
1088- fd = FileNameOpenFile (path ,O_RDWR ,0600 );
1089- #else
1090- fd = FileNameOpenFile (path ,O_RDWR |O_BINARY ,0600 );
1091- #endif
1092-
1093- if (fd < 0 )
1094- return NULL ;
1095-
1096- vfd = _fdvec_alloc ();
1097- if (vfd < 0 )
1098- return NULL ;
1099-
1100- Md_fdvec [vfd ].mdfd_vfd = fd ;
1101- Md_fdvec [vfd ].mdfd_flags = MDFD_TEMP ;
1102- Md_fdvec [vfd ].mdfd_lstbcnt = _mdnblocks (fd ,BLCKSZ );
11031083#ifndef LET_OS_MANAGE_FILESIZE
1104- Md_fdvec [vfd ].mdfd_chain = (MdfdVec * )NULL ;
1105-
1106- #ifdef DIAGNOSTIC
1107- if (Md_fdvec [vfd ].mdfd_lstbcnt > RELSEG_SIZE )
1108- elog (FATAL ,"segment too big on relopen!" );
1109- #endif
1110-
1111- targsegno = blkno /RELSEG_SIZE ;
1112- for (v = & Md_fdvec [vfd ],segno = 1 ;segno <=targsegno ;segno ++ )
1084+ /* append the '.segno', if needed */
1085+ segno = blkno /RELSEG_SIZE ;
1086+ if (segno > 0 )
11131087{
1114- char * segpath ;
1115- MdfdVec * newv ;
1116- MemoryContext oldcxt ;
1088+ char * segpath = (char * )palloc (strlen (path )+ 12 );
11171089
1118- segpath = (char * )palloc (strlen (path )+ 12 );
11191090sprintf (segpath ,"%s.%d" ,path ,segno );
1120-
1121- #ifndef __CYGWIN32__
1122- fd = FileNameOpenFile (segpath ,O_RDWR |O_CREAT ,0600 );
1123- #else
1124- fd = FileNameOpenFile (segpath ,O_RDWR |O_BINARY |O_CREAT ,0600 );
1091+ pfree (path );
1092+ path = segpath ;
1093+ }
11251094#endif
11261095
1127- pfree (segpath );
1128-
1129- if (fd < 0 )
1130- return (MdfdVec * )NULL ;
1131-
1132- /* allocate an mdfdvec entry for it */
1133- oldcxt = MemoryContextSwitchTo (MdCxt );
1134- newv = (MdfdVec * )palloc (sizeof (MdfdVec ));
1135- MemoryContextSwitchTo (oldcxt );
1136-
1137- /* fill the entry */
1138- newv -> mdfd_vfd = fd ;
1139- newv -> mdfd_flags = MDFD_TEMP ;
1140- newv -> mdfd_lstbcnt = _mdnblocks (fd ,BLCKSZ );
1141- newv -> mdfd_chain = (MdfdVec * )NULL ;
1142- #ifdef DIAGNOSTIC
1143- if (newv -> mdfd_lstbcnt > RELSEG_SIZE )
1144- elog (FATAL ,"segment too big on open!" );
1145- #endif
1146- v -> mdfd_chain = newv ;
1147- v = newv ;
1148- }
1096+ #ifndef __CYGWIN32__
1097+ fd = open (path ,O_RDWR ,0600 );
11491098#else
1150- v = & Md_fdvec [ vfd ] ;
1099+ fd = open ( path , O_RDWR | O_BINARY , 0600 ) ;
11511100#endif
11521101
11531102pfree (path );
11541103
1155- return v ;
1104+ return fd ;
11561105}
11571106
11581107static BlockNumber