Uh oh!
There was an error while loading. Please reload this page.
- Notifications
You must be signed in to change notification settings - Fork 34k
bpo-33671: efficient zero-copy for shutil.copy* functions (Linux, OSX and Win) #7160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
1a72c0177c4bfa2afa04a542cd173520c6c050a722c1fd38a2ab6317dacc3b629d5881114c4de501c0dd41b4506fdb097364d2bc53a3c8ef7861737f3eecfdf67ce57d4572548eb211da0fe7037296147d0c3bba2cafd80bb2a75fe5025dca36a534e9da3fa9fcc2e74f3224224ad25a24d20e67b6e576b82ddc9b62b61e34e96186b20902abf3ecb91e492ce02c69d73837e228be4c16c59adf700629d077912e62c6568a40a7557ba00856c96d9780fbe6efdf4bcb185f130c8c98ae17bb5e6d8b9bf9b59ac578eefce74fc8c6b3048e3d11102e175452733261b7451c476d729dd231823828a9d6a07e3ce917f81a0ec3e7475b05dd3cf9b549302bec11cc87648f941f7404d28c122149b8b6a02a2a2287508b9da5d5c921f46bb24490fef8b3271be4536035fe28dc651e5d0eadad67cdc5f65c8ae9c4508ebb1fee6566898af43505330c9a5733f362fe17e729bc46f75cabbc02d22ee087a08203ab284e9ac9479dfd77a7e42a597e5008a8de89dd20c0dc4b829b9730a1bed32d9d27a717bd78bb1d49175ce94e407bcef5File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading. Please reload this page.
Jump to
Uh oh!
There was an error while loading. Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -10,6 +10,7 @@ | ||
| import fnmatch | ||
| import collections | ||
| import errno | ||
| import io | ||
| try: | ||
| import zlib | ||
| @@ -42,6 +43,16 @@ | ||
| except ImportError: | ||
| getgrnam = None | ||
# Platform-specific low-level modules; the one that does not apply to the
# running OS (or both, on other platforms) stays bound to None.
posix = None
nt = None
if os.name == 'nt':
    import nt
elif os.name == 'posix':
    import posix

# Default chunk size for userspace copies: 1 MiB on Windows, 16 KiB elsewhere.
if os.name == 'nt':
    COPY_BUFSIZE = 1024 * 1024
else:
    COPY_BUFSIZE = 16 * 1024

# Feature flags consulted before attempting a fast-copy syscall.  They are
# only used for their truthiness (falsy when `posix` is None).
_HAS_SENDFILE = posix and hasattr(os, "sendfile")
_HAS_FCOPYFILE = posix and hasattr(posix, "_fcopyfile")  # OSX
| __all__ = ["copyfileobj", "copyfile", "copymode", "copystat", "copy", "copy2", | ||
| "copytree", "move", "rmtree", "Error", "SpecialFileError", | ||
| "ExecError", "make_archive", "get_archive_formats", | ||
| @@ -72,14 +83,124 @@ class RegistryError(Exception): | ||
| """Raised when a registry operation with the archiving | ||
| and unpacking registries fails""" | ||
| class _GiveupOnFastCopy(Exception): | ||
| """Raised as a signal to fallback on using raw read()/write() | ||
| file copy when fast-copy functions fail to do so. | ||
| """ | ||
def _fastcopy_osx(fsrc, fdst, flags):
    """Copy a regular file's content or metadata with fcopyfile(3) (OSX).

    *fsrc* and *fdst* are file objects; *flags* is handed straight to
    posix._fcopyfile().  _GiveupOnFastCopy is raised when a file descriptor
    cannot be obtained or when the call fails with EINVAL or ENOTSUP; any
    other OSError propagates with filename/filename2 filled in.
    """
    try:
        src_fd, dst_fd = fsrc.fileno(), fdst.fileno()
    except Exception as exc:
        raise _GiveupOnFastCopy(exc)  # not a regular file

    try:
        posix._fcopyfile(src_fd, dst_fd, flags)
    except OSError as exc:
        # Attach both paths so the error is more informative.
        exc.filename = fsrc.name
        exc.filename2 = fdst.name
        if exc.errno not in {errno.EINVAL, errno.ENOTSUP}:
            raise exc from None
        raise _GiveupOnFastCopy(exc)
def _fastcopy_sendfile(fsrc, fdst):
    """Copy data from one regular mmap-like fd to another by using
    high-performance sendfile(2) syscall.
    This should work on Linux >= 2.6.33 and Solaris only.

    *fsrc* and *fdst* are file objects; their fileno() and name attributes
    are used.  Raises _GiveupOnFastCopy when a file descriptor cannot be
    obtained, when sendfile() fails with ENOTSOCK (which also disables
    further attempts by clearing _HAS_SENDFILE), or when the very first
    call fails before anything was written to *fdst*.  Any other OSError
    propagates with filename/filename2 filled in.
    """
    # Note: copyfileobj() is left alone in order to not introduce any
    # unexpected breakage. Possible risks by using zero-copy calls
    # in copyfileobj() are:
    # - fdst cannot be open in "a"(ppend) mode
    # - fsrc and fdst may be open in "t"(ext) mode
    # - fsrc may be a BufferedReader (which hides unread data in a buffer),
    #   GzipFile (which decompresses data), HTTPResponse (which decodes
    #   chunks).
    # - possibly others (e.g. encrypted fs/partition?)
    global _HAS_SENDFILE
    import sys  # local import: only needed for the 32-bit cap below

    try:
        infd = fsrc.fileno()
        outfd = fdst.fileno()
    except Exception as err:
        raise _GiveupOnFastCopy(err)  # not a regular file

    # Hopefully the whole file will be copied in a single call.
    # sendfile() is called in a loop 'till EOF is reached (0 return)
    # so a bufsize smaller or bigger than the actual file size
    # should not make any difference, also in case the file content
    # changes while being copied.
    try:
        blocksize = max(os.fstat(infd).st_size, 2 ** 23)  # min 8MiB
    except OSError:
        blocksize = 2 ** 27  # 128MiB
    # On 32-bit builds a count that does not fit the platform's signed
    # size type makes os.sendfile() raise OverflowError (sources > 2GiB),
    # so cap the request at 1GiB; the loop below copies the remainder.
    if sys.maxsize < 2 ** 32:
        blocksize = min(blocksize, 2 ** 30)

    offset = 0
    while True:
        try:
            sent = os.sendfile(outfd, infd, offset, blocksize)
        except OSError as err:
            # ...in order to have a more informative exception.
            err.filename = fsrc.name
            err.filename2 = fdst.name

            if err.errno == errno.ENOTSOCK:
                # sendfile() on this platform (probably Linux < 2.6.33)
                # does not support copies between regular files (only
                # sockets).
                _HAS_SENDFILE = False
                raise _GiveupOnFastCopy(err)

            if err.errno == errno.ENOSPC:  # filesystem is full
                raise err from None

            # Give up on first call and if no data was copied.
            if offset == 0 and os.lseek(outfd, 0, os.SEEK_CUR) == 0:
                raise _GiveupOnFastCopy(err)

            raise err
        else:
            if sent == 0:
                break  # EOF
            offset += sent
def _copybinfileobj(fsrc, fdst, length=COPY_BUFSIZE):
    """Copy 2 regular file objects open in binary mode.

    Data is read with fsrc.readinto() into a single reusable buffer of
    *length* bytes and handed to fdst.write() until readinto() reports
    that nothing more was read.
    """
    # Localize variable access to minimize overhead.
    fsrc_readinto = fsrc.readinto
    fdst_write = fdst.write
    with memoryview(bytearray(length)) as mv:
        while True:
            n = fsrc_readinto(mv)
            if not n:
                break
            elif n < length:
                # Short read: write only the filled prefix.  The slice is a
                # separate memoryview, so release it right away instead of
                # leaving it to the garbage collector; with a big buffer a
                # lingering view could otherwise keep the memory alive on
                # GC-based implementations.
                with mv[:n] as smv:
                    fdst_write(smv)
            else:
                fdst_write(mv)
| def _is_binary_files_pair(fsrc, fdst): | ||
| return hasattr(fsrc, 'readinto') and \ | ||
| isinstance(fsrc, io.BytesIO) or 'b' in getattr(fsrc, 'mode', '') and \ | ||
Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. Which objects provide ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I think catching ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I benchmarked Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. I don't know how you're testing, but the performance difference with ContributorAuthor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others.Learn more. This is how I'm testing it: $ python -c"import os; f = open('f1', 'wb'); f.write(os.urandom(8 * 1024 * 1024))"$time ./python -m timeit -s'import shutil; p1 = "f1"; p2 = "f2"''shutil.copyfile(p1, p2)' ContributorAuthor
| ||
| isinstance(fdst, io.BytesIO) or 'b' in getattr(fdst, 'mode', '') | ||
def copyfileobj(fsrc, fdst, length=COPY_BUFSIZE):
    """copy data from file-like object fsrc to file-like object fdst

    *length* is the chunk size.  When both objects are detected as binary
    and *length* is positive, a reusable buffer filled via readinto() is
    used; otherwise data is moved with plain read(length)/write() calls
    until read() returns an empty result.
    """
    # The buffer-based path allocates bytearray(length), which rejects
    # negative sizes.  A negative length has to keep meaning "read
    # everything in one go", so only positive sizes take the fast path.
    # The pair check comes first so that `length` is only compared when
    # the binary path is actually a candidate.
    if _is_binary_files_pair(fsrc, fdst) and length > 0:
        _copybinfileobj(fsrc, fdst, length=length)
        return

    # Localize variable access to minimize overhead.
    fsrc_read = fsrc.read
    fdst_write = fdst.write
    while True:
        buf = fsrc_read(length)
        if not buf:
            break
        fdst_write(buf)
| def _samefile(src, dst): | ||
| # Macintosh, Unix. | ||
| @@ -117,9 +238,23 @@ def copyfile(src, dst, *, follow_symlinks=True): | ||
| if not follow_symlinks and os.path.islink(src): | ||
| os.symlink(os.readlink(src), dst) | ||
| else: | ||
| with open(src, 'rb') as fsrc, open(dst, 'wb') as fdst: | ||
| if _HAS_SENDFILE: | ||
| try: | ||
| _fastcopy_sendfile(fsrc, fdst) | ||
| return dst | ||
| except _GiveupOnFastCopy: | ||
| pass | ||
| if _HAS_FCOPYFILE: | ||
| try: | ||
| _fastcopy_osx(fsrc, fdst, posix._COPYFILE_DATA) | ||
| return dst | ||
| except _GiveupOnFastCopy: | ||
| pass | ||
| _copybinfileobj(fsrc, fdst) | ||
| return dst | ||
| def copymode(src, dst, *, follow_symlinks=True): | ||
| @@ -244,13 +379,12 @@ def copy(src, dst, *, follow_symlinks=True): | ||
| def copy2(src, dst, *, follow_symlinks=True): | ||
| """Copy data and all stat info ("cp -p src dst"). Return the file's | ||
| destination. | ||
| The destination may be a directory. | ||
| If follow_symlinks is false, symlinks won't be followed. This | ||
| resembles GNU's "cp -P src dst". | ||
| """ | ||
| if os.path.isdir(dst): | ||
| dst = os.path.join(dst, os.path.basename(src)) | ||
| @@ -1015,7 +1149,6 @@ def disk_usage(path): | ||
| elif os.name == 'nt': | ||
| __all__.append('disk_usage') | ||
| _ntuple_diskusage = collections.namedtuple('usage', 'total used free') | ||
Uh oh!
There was an error while loading. Please reload this page.