|
28 | 28 | #include"access/xlog.h"
|
29 | 29 | #include"access/xlogutils.h"
|
30 | 30 | #include"commands/tablespace.h"
|
| 31 | +#include"common/file_utils.h" |
31 | 32 | #include"miscadmin.h"
|
32 | 33 | #include"pg_trace.h"
|
33 | 34 | #include"pgstat.h"
|
@@ -754,138 +755,274 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
|
754 | 755 | }
|
755 | 756 |
|
756 | 757 | /*
|
757 |
| - * mdread() -- Read the specified block from a relation. |
| 758 | + * Convert an array of buffer address into an array of iovec objects, and |
| 759 | + * return the number that were required. 'iov' must have enough space for up |
| 760 | + * to 'nblocks' elements, but the number used may be less depending on |
| 761 | + * merging. In the case of a run of fully contiguous buffers, a single iovec |
| 762 | + * will be populated that can be handled as a plain non-vectored I/O. |
758 | 763 | */
|
759 |
| -void |
760 |
| -mdread(SMgrRelationreln,ForkNumberforknum,BlockNumberblocknum, |
761 |
| -void*buffer) |
| 764 | +staticint |
| 765 | +buffers_to_iovec(structiovec*iov,void**buffers,intnblocks) |
762 | 766 | {
|
763 |
| -off_tseekpos; |
764 |
| -intnbytes; |
765 |
| -MdfdVec*v; |
| 767 | +structiovec*iovp; |
| 768 | +intiovcnt; |
766 | 769 |
|
767 |
| -/* If this build supports direct I/O, the buffer must be I/O aligned. */ |
768 |
| -if (PG_O_DIRECT!=0&&PG_IO_ALIGN_SIZE <=BLCKSZ) |
769 |
| -Assert((uintptr_t)buffer==TYPEALIGN(PG_IO_ALIGN_SIZE,buffer)); |
| 770 | +Assert(nblocks >=1); |
770 | 771 |
|
771 |
| -TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum,blocknum, |
772 |
| -reln->smgr_rlocator.locator.spcOid, |
773 |
| -reln->smgr_rlocator.locator.dbOid, |
774 |
| -reln->smgr_rlocator.locator.relNumber, |
775 |
| -reln->smgr_rlocator.backend); |
776 |
| - |
777 |
| -v=_mdfd_getseg(reln,forknum,blocknum, false, |
778 |
| -EXTENSION_FAIL |EXTENSION_CREATE_RECOVERY); |
| 772 | +/* If this build supports direct I/O, buffers must be I/O aligned. */ |
| 773 | +for (inti=0;i<nblocks;++i) |
| 774 | +{ |
| 775 | +if (PG_O_DIRECT!=0&&PG_IO_ALIGN_SIZE <=BLCKSZ) |
| 776 | +Assert((uintptr_t)buffers[i]== |
| 777 | +TYPEALIGN(PG_IO_ALIGN_SIZE,buffers[i])); |
| 778 | +} |
779 | 779 |
|
780 |
| -seekpos= (off_t)BLCKSZ* (blocknum % ((BlockNumber)RELSEG_SIZE)); |
| 780 | +/* Start the first iovec off with the first buffer. */ |
| 781 | +iovp=&iov[0]; |
| 782 | +iovp->iov_base=buffers[0]; |
| 783 | +iovp->iov_len=BLCKSZ; |
| 784 | +iovcnt=1; |
781 | 785 |
|
782 |
| -Assert(seekpos< (off_t)BLCKSZ*RELSEG_SIZE); |
| 786 | +/* Try to merge the rest. */ |
| 787 | +for (inti=1;i<nblocks;++i) |
| 788 | +{ |
| 789 | +void*buffer=buffers[i]; |
783 | 790 |
|
784 |
| -nbytes=FileRead(v->mdfd_vfd,buffer,BLCKSZ,seekpos,WAIT_EVENT_DATA_FILE_READ); |
| 791 | +if (((char*)iovp->iov_base+iovp->iov_len)==buffer) |
| 792 | +{ |
| 793 | +/* Contiguous with the last iovec. */ |
| 794 | +iovp->iov_len+=BLCKSZ; |
| 795 | +} |
| 796 | +else |
| 797 | +{ |
| 798 | +/* Need a new iovec. */ |
| 799 | +iovp++; |
| 800 | +iovp->iov_base=buffer; |
| 801 | +iovp->iov_len=BLCKSZ; |
| 802 | +iovcnt++; |
| 803 | +} |
| 804 | +} |
785 | 805 |
|
786 |
| -TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum,blocknum, |
787 |
| -reln->smgr_rlocator.locator.spcOid, |
788 |
| -reln->smgr_rlocator.locator.dbOid, |
789 |
| -reln->smgr_rlocator.locator.relNumber, |
790 |
| -reln->smgr_rlocator.backend, |
791 |
| -nbytes, |
792 |
| -BLCKSZ); |
| 806 | +returniovcnt; |
| 807 | +} |
793 | 808 |
|
794 |
| -if (nbytes!=BLCKSZ) |
| 809 | +/* |
| 810 | + * mdreadv() -- Read the specified blocks from a relation. |
| 811 | + */ |
| 812 | +void |
| 813 | +mdreadv(SMgrRelationreln,ForkNumberforknum,BlockNumberblocknum, |
| 814 | +void**buffers,BlockNumbernblocks) |
| 815 | +{ |
| 816 | +while (nblocks>0) |
795 | 817 | {
|
796 |
| -if (nbytes<0) |
797 |
| -ereport(ERROR, |
798 |
| -(errcode_for_file_access(), |
799 |
| -errmsg("could not read block %u in file \"%s\": %m", |
800 |
| -blocknum,FilePathName(v->mdfd_vfd)))); |
| 818 | +structioveciov[PG_IOV_MAX]; |
| 819 | +intiovcnt; |
| 820 | +off_tseekpos; |
| 821 | +intnbytes; |
| 822 | +MdfdVec*v; |
| 823 | +BlockNumbernblocks_this_segment; |
| 824 | +size_ttransferred_this_segment; |
| 825 | +size_tsize_this_segment; |
| 826 | + |
| 827 | +v=_mdfd_getseg(reln,forknum,blocknum, false, |
| 828 | +EXTENSION_FAIL |EXTENSION_CREATE_RECOVERY); |
| 829 | + |
| 830 | +seekpos= (off_t)BLCKSZ* (blocknum % ((BlockNumber)RELSEG_SIZE)); |
| 831 | + |
| 832 | +Assert(seekpos< (off_t)BLCKSZ*RELSEG_SIZE); |
| 833 | + |
| 834 | +nblocks_this_segment= |
| 835 | +Min(nblocks, |
| 836 | +RELSEG_SIZE- (blocknum % ((BlockNumber)RELSEG_SIZE))); |
| 837 | +nblocks_this_segment=Min(nblocks_this_segment,lengthof(iov)); |
| 838 | + |
| 839 | +iovcnt=buffers_to_iovec(iov,buffers,nblocks_this_segment); |
| 840 | +size_this_segment=nblocks_this_segment*BLCKSZ; |
| 841 | +transferred_this_segment=0; |
801 | 842 |
|
802 | 843 | /*
|
803 |
| - * Short read: we are at or past EOF, or we read a partial block at |
804 |
| - * EOF. Normally this is an error; upper levels should never try to |
805 |
| - * read a nonexistent block. However, if zero_damaged_pages is ON or |
806 |
| - * we are InRecovery, we should instead return zeroes without |
807 |
| - * complaining. This allows, for example, the case of trying to |
808 |
| - * update a block that was later truncated away. |
| 844 | + * Inner loop to continue after a short read. We'll keep going until |
| 845 | + * we hit EOF rather than assuming that a short read means we hit the |
| 846 | + * end. |
809 | 847 | */
|
810 |
| -if (zero_damaged_pages||InRecovery) |
811 |
| -MemSet(buffer,0,BLCKSZ); |
812 |
| -else |
813 |
| -ereport(ERROR, |
814 |
| -(errcode(ERRCODE_DATA_CORRUPTED), |
815 |
| -errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", |
816 |
| -blocknum,FilePathName(v->mdfd_vfd), |
817 |
| -nbytes,BLCKSZ))); |
| 848 | +for (;;) |
| 849 | +{ |
| 850 | +TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum,blocknum, |
| 851 | +reln->smgr_rlocator.locator.spcOid, |
| 852 | +reln->smgr_rlocator.locator.dbOid, |
| 853 | +reln->smgr_rlocator.locator.relNumber, |
| 854 | +reln->smgr_rlocator.backend); |
| 855 | +nbytes=FileReadV(v->mdfd_vfd,iov,iovcnt,seekpos, |
| 856 | +WAIT_EVENT_DATA_FILE_READ); |
| 857 | +TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum,blocknum, |
| 858 | +reln->smgr_rlocator.locator.spcOid, |
| 859 | +reln->smgr_rlocator.locator.dbOid, |
| 860 | +reln->smgr_rlocator.locator.relNumber, |
| 861 | +reln->smgr_rlocator.backend, |
| 862 | +nbytes, |
| 863 | +size_this_segment-transferred_this_segment); |
| 864 | + |
| 865 | +#ifdefSIMULATE_SHORT_READ |
| 866 | +nbytes=Min(nbytes,4096); |
| 867 | +#endif |
| 868 | + |
| 869 | +if (nbytes<0) |
| 870 | +ereport(ERROR, |
| 871 | +(errcode_for_file_access(), |
| 872 | +errmsg("could not read blocks %u..%u in file \"%s\": %m", |
| 873 | +blocknum, |
| 874 | +blocknum+nblocks_this_segment-1, |
| 875 | +FilePathName(v->mdfd_vfd)))); |
| 876 | + |
| 877 | +if (nbytes==0) |
| 878 | +{ |
| 879 | +/* |
| 880 | + * We are at or past EOF, or we read a partial block at EOF. |
| 881 | + * Normally this is an error; upper levels should never try to |
| 882 | + * read a nonexistent block. However, if zero_damaged_pages |
| 883 | + * is ON or we are InRecovery, we should instead return zeroes |
| 884 | + * without complaining. This allows, for example, the case of |
| 885 | + * trying to update a block that was later truncated away. |
| 886 | + */ |
| 887 | +if (zero_damaged_pages||InRecovery) |
| 888 | +{ |
| 889 | +for (BlockNumberi=transferred_this_segment /BLCKSZ; |
| 890 | +i<nblocks_this_segment; |
| 891 | +++i) |
| 892 | +memset(buffers[i],0,BLCKSZ); |
| 893 | +break; |
| 894 | +} |
| 895 | +else |
| 896 | +ereport(ERROR, |
| 897 | +(errcode(ERRCODE_DATA_CORRUPTED), |
| 898 | +errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes", |
| 899 | +blocknum, |
| 900 | +blocknum+nblocks_this_segment-1, |
| 901 | +FilePathName(v->mdfd_vfd), |
| 902 | +transferred_this_segment, |
| 903 | +size_this_segment))); |
| 904 | +} |
| 905 | + |
| 906 | +/* One loop should usually be enough. */ |
| 907 | +transferred_this_segment+=nbytes; |
| 908 | +Assert(transferred_this_segment <=size_this_segment); |
| 909 | +if (transferred_this_segment==size_this_segment) |
| 910 | +break; |
| 911 | + |
| 912 | +/* Adjust position and vectors after a short read. */ |
| 913 | +seekpos+=nbytes; |
| 914 | +iovcnt=compute_remaining_iovec(iov,iov,iovcnt,nbytes); |
| 915 | +} |
| 916 | + |
| 917 | +nblocks-=nblocks_this_segment; |
| 918 | +buffers+=nblocks_this_segment; |
| 919 | +blocknum+=nblocks_this_segment; |
818 | 920 | }
|
819 | 921 | }
|
820 | 922 |
|
821 | 923 | /*
|
822 |
| - *mdwrite() -- Write the suppliedblock at the appropriate location. |
| 924 | + *mdwritev() -- Write the suppliedblocks at the appropriate location. |
823 | 925 | *
|
824 | 926 | * This is to be used only for updating already-existing blocks of a
|
825 | 927 | * relation (ie, those before the current EOF). To extend a relation,
|
826 | 928 | * use mdextend().
|
827 | 929 | */
|
828 | 930 | void
|
829 |
| -mdwrite(SMgrRelationreln,ForkNumberforknum,BlockNumberblocknum, |
830 |
| -constvoid*buffer,boolskipFsync) |
| 931 | +mdwritev(SMgrRelationreln,ForkNumberforknum,BlockNumberblocknum, |
| 932 | +constvoid**buffers,BlockNumbernblocks,boolskipFsync) |
831 | 933 | {
|
832 |
| -off_tseekpos; |
833 |
| -intnbytes; |
834 |
| -MdfdVec*v; |
835 |
| - |
836 |
| -/* If this build supports direct I/O, the buffer must be I/O aligned. */ |
837 |
| -if (PG_O_DIRECT!=0&&PG_IO_ALIGN_SIZE <=BLCKSZ) |
838 |
| -Assert((uintptr_t)buffer==TYPEALIGN(PG_IO_ALIGN_SIZE,buffer)); |
839 |
| - |
840 | 934 | /* This assert is too expensive to have on normally ... */
|
841 | 935 | #ifdefCHECK_WRITE_VS_EXTEND
|
842 | 936 | Assert(blocknum<mdnblocks(reln,forknum));
|
843 | 937 | #endif
|
844 | 938 |
|
845 |
| -TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum,blocknum, |
846 |
| -reln->smgr_rlocator.locator.spcOid, |
847 |
| -reln->smgr_rlocator.locator.dbOid, |
848 |
| -reln->smgr_rlocator.locator.relNumber, |
849 |
| -reln->smgr_rlocator.backend); |
| 939 | +while (nblocks>0) |
| 940 | +{ |
| 941 | +structioveciov[PG_IOV_MAX]; |
| 942 | +intiovcnt; |
| 943 | +off_tseekpos; |
| 944 | +intnbytes; |
| 945 | +MdfdVec*v; |
| 946 | +BlockNumbernblocks_this_segment; |
| 947 | +size_ttransferred_this_segment; |
| 948 | +size_tsize_this_segment; |
850 | 949 |
|
851 |
| -v=_mdfd_getseg(reln,forknum,blocknum,skipFsync, |
852 |
| -EXTENSION_FAIL |EXTENSION_CREATE_RECOVERY); |
| 950 | +v=_mdfd_getseg(reln,forknum,blocknum,skipFsync, |
| 951 | +EXTENSION_FAIL |EXTENSION_CREATE_RECOVERY); |
853 | 952 |
|
854 |
| -seekpos= (off_t)BLCKSZ* (blocknum % ((BlockNumber)RELSEG_SIZE)); |
| 953 | +seekpos= (off_t)BLCKSZ* (blocknum % ((BlockNumber)RELSEG_SIZE)); |
855 | 954 |
|
856 |
| -Assert(seekpos< (off_t)BLCKSZ*RELSEG_SIZE); |
| 955 | +Assert(seekpos< (off_t)BLCKSZ*RELSEG_SIZE); |
857 | 956 |
|
858 |
| -nbytes=FileWrite(v->mdfd_vfd,buffer,BLCKSZ,seekpos,WAIT_EVENT_DATA_FILE_WRITE); |
| 957 | +nblocks_this_segment= |
| 958 | +Min(nblocks, |
| 959 | +RELSEG_SIZE- (blocknum % ((BlockNumber)RELSEG_SIZE))); |
| 960 | +nblocks_this_segment=Min(nblocks_this_segment,lengthof(iov)); |
859 | 961 |
|
860 |
| -TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum,blocknum, |
861 |
| -reln->smgr_rlocator.locator.spcOid, |
862 |
| -reln->smgr_rlocator.locator.dbOid, |
863 |
| -reln->smgr_rlocator.locator.relNumber, |
864 |
| -reln->smgr_rlocator.backend, |
865 |
| -nbytes, |
866 |
| -BLCKSZ); |
| 962 | +iovcnt=buffers_to_iovec(iov, (void**)buffers,nblocks_this_segment); |
| 963 | +size_this_segment=nblocks_this_segment*BLCKSZ; |
| 964 | +transferred_this_segment=0; |
867 | 965 |
|
868 |
| -if (nbytes!=BLCKSZ) |
869 |
| -{ |
870 |
| -if (nbytes<0) |
871 |
| -ereport(ERROR, |
872 |
| -(errcode_for_file_access(), |
873 |
| -errmsg("could not write block %u in file \"%s\": %m", |
874 |
| -blocknum,FilePathName(v->mdfd_vfd)))); |
875 |
| -/* short write: complain appropriately */ |
876 |
| -ereport(ERROR, |
877 |
| -(errcode(ERRCODE_DISK_FULL), |
878 |
| -errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", |
879 |
| -blocknum, |
880 |
| -FilePathName(v->mdfd_vfd), |
881 |
| -nbytes,BLCKSZ), |
882 |
| -errhint("Check free disk space."))); |
883 |
| -} |
| 966 | +/* |
| 967 | + * Inner loop to continue after a short write. If the reason is that |
| 968 | + * we're out of disk space, a future attempt should get an ENOSPC |
| 969 | + * error from the kernel. |
| 970 | + */ |
| 971 | +for (;;) |
| 972 | +{ |
| 973 | +TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum,blocknum, |
| 974 | +reln->smgr_rlocator.locator.spcOid, |
| 975 | +reln->smgr_rlocator.locator.dbOid, |
| 976 | +reln->smgr_rlocator.locator.relNumber, |
| 977 | +reln->smgr_rlocator.backend); |
| 978 | +nbytes=FileWriteV(v->mdfd_vfd,iov,iovcnt,seekpos, |
| 979 | +WAIT_EVENT_DATA_FILE_WRITE); |
| 980 | +TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum,blocknum, |
| 981 | +reln->smgr_rlocator.locator.spcOid, |
| 982 | +reln->smgr_rlocator.locator.dbOid, |
| 983 | +reln->smgr_rlocator.locator.relNumber, |
| 984 | +reln->smgr_rlocator.backend, |
| 985 | +nbytes, |
| 986 | +size_this_segment-transferred_this_segment); |
| 987 | + |
| 988 | +#ifdefSIMULATE_SHORT_WRITE |
| 989 | +nbytes=Min(nbytes,4096); |
| 990 | +#endif |
884 | 991 |
|
885 |
| -if (!skipFsync&& !SmgrIsTemp(reln)) |
886 |
| -register_dirty_segment(reln,forknum,v); |
| 992 | +if (nbytes<0) |
| 993 | +{ |
| 994 | +boolenospc=errno==ENOSPC; |
| 995 | + |
| 996 | +ereport(ERROR, |
| 997 | +(errcode_for_file_access(), |
| 998 | +errmsg("could not write blocks %u..%u in file \"%s\": %m", |
| 999 | +blocknum, |
| 1000 | +blocknum+nblocks_this_segment-1, |
| 1001 | +FilePathName(v->mdfd_vfd)), |
| 1002 | +enospc ?errhint("Check free disk space.") :0)); |
| 1003 | +} |
| 1004 | + |
| 1005 | +/* One loop should usually be enough. */ |
| 1006 | +transferred_this_segment+=nbytes; |
| 1007 | +Assert(transferred_this_segment <=size_this_segment); |
| 1008 | +if (transferred_this_segment==size_this_segment) |
| 1009 | +break; |
| 1010 | + |
| 1011 | +/* Adjust position and iovecs after a short write. */ |
| 1012 | +seekpos+=nbytes; |
| 1013 | +iovcnt=compute_remaining_iovec(iov,iov,iovcnt,nbytes); |
| 1014 | +} |
| 1015 | + |
| 1016 | +if (!skipFsync&& !SmgrIsTemp(reln)) |
| 1017 | +register_dirty_segment(reln,forknum,v); |
| 1018 | + |
| 1019 | +nblocks-=nblocks_this_segment; |
| 1020 | +buffers+=nblocks_this_segment; |
| 1021 | +blocknum+=nblocks_this_segment; |
| 1022 | +} |
887 | 1023 | }
|
888 | 1024 |
|
| 1025 | + |
889 | 1026 | /*
|
890 | 1027 | * mdwriteback() -- Tell the kernel to write pages back to storage.
|
891 | 1028 | *
|
|