Dec 24, 2023 · Dec 24, 2023 · Dec 25, 2023 · Dec 25, 2023 · Dec 29, 2023 · Dec 29, 2023
diff --git a/Doc/library/importlib.rst b/Doc/library/importlib.rst
      :func:`importlib.util.spec_from_loader` may be useful for implementing
      concrete ``MetaPathFinders``.

      *Fullname* must be normalized in NFKC to match the normalization
      done by the Python parser.

      .. versionadded:: 3.4

   .. method:: invalidate_caches()
      guess about what spec to return. :func:`importlib.util.spec_from_loader`
      may be useful for implementing concrete ``PathEntryFinders``.

      *Fullname* must be normalized in NFKC to match the normalization
      done by the Python parser.

      .. versionadded:: 3.4

   .. method:: invalidate_caches()
diff --git a/Lib/importlib/_bootstrap.py b/Lib/importlib/_bootstrap.py
    except AttributeError:
        return type(obj).__qualname__

 def _normalize(name):
    """Normalize 'name' to NKFC form."""
    global _unicodedata_normalize
    if _unicodedata_normalize is None:
        from unicodedata import normalize as _unicodedata_normalize
    return _unicodedata_normalize('NFKC', name)

 # Bootstrap-related code ######################################################

 # Modules injected manually by _setup()
 # Import done by _install_external_importers()
 _bootstrap_external = None

 # Import done lazily as needed by _normalize as unicodedata is not built-in.
 _unicodedata_normalize = None

 def _wrap(new, old):
    """Simple substitute for functools.update_wrapper."""
    the loader did not.

    """
    global _unicodedata
    _sanity_check(name, package, level)

    if not name.isascii():
        name = _normalize(name)

    if package is not None and not package.isascii():
        package = _normalize(package)

    if level > 0:
        name = _resolve_name(name, package, level)
    return _find_and_load(name, _gcd_import)
diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py
        else:
            self.path = _path_abspath(path)
        self._path_mtime = -1
        self._path_cache = set()
        self._relaxed_path_cache = set()
        self._path_cache = {}
        self._relaxed_path_cache = {}
        self._cache_is_normalized = False

    def invalidate_caches(self):
        """Invalidate the directory mtime."""
            self._fill_cache()
            self._path_mtime = mtime
        # tail_module keeps the original casing, for __file__ and friends
        if not tail_module.isascii() and not self._cache_is_normalized:
            self._normalize_cache()
        if _relax_case():
            cache = self._relaxed_path_cache
            cache_module = tail_module.lower()
        else:
            cache = self._path_cache
            cache_module = tail_module
        # Check if the module is the name of a directory (and thus a package).
        if cache_module in cache:
            base_path = _path_join(self.path, tail_module)
        try:
            cache_path = cache[cache_module]
        except KeyError:
            pass
        else:
            base_path = _path_join(self.path, cache_path)
            for suffix, loader_class in self._loaders:
                init_filename = '__init__' + suffix
                full_path = _path_join(base_path, init_filename)
                is_namespace = _path_isdir(base_path)
        # Check for a file w/ a proper suffix exists.
        for suffix, loader_class in self._loaders:
            # XXX: Why is ValueError caught here?
            #try:
            #    full_path = _path_join(self.path, tail_module + suffix)
            #except ValueError:
            #    return None
            _bootstrap._verbose_message('trying {}{} in {}', cache_module, suffix, self.path, verbosity=2)
            try:
                full_path = _path_join(self.path, tail_module + suffix)
            except ValueError:
                return None
            _bootstrap._verbose_message('trying {}', full_path, verbosity=2)
            if cache_module + suffix in cache:
                cache_path = cache[cache_module + suffix]
            except KeyError:
                pass

            else:
                full_path = _path_join(self.path, cache_path)
                if _path_isfile(full_path):
                    return self._get_spec(loader_class, fullname, full_path,
                                          None, target)
                    return self._get_spec(loader_class, fullname, full_path, None, target)
        if is_namespace:
            _bootstrap._verbose_message('possible namespace for {}', base_path)
            spec = _bootstrap.ModuleSpec(fullname, None)
        # We store two cached versions, to handle runtime changes of the
        # PYTHONCASEOK environment variable.
        if not sys.platform.startswith('win'):
            self._path_cache =set(contents)
            self._path_cache ={ p: p for p incontents }
        else:
            # Windows users can import modules with case-insensitive file
            # suffixes (for legacy reasons). Make the suffix lowercase here
            # so it's done once instead of for every import. This is safe as
            # the specified suffixes to check against are always specified in a
            # case-sensitive manner.
            lower_suffix_contents =set()
            lower_suffix_contents ={}
            for item in contents:
                name, dot, suffix = item.partition('.')
                if dot:
                    new_name = f'{name}.{suffix.lower()}'
                else:
                    new_name = name
                lower_suffix_contents.add(new_name)
                lower_suffix_contents[new_name] = item
            self._path_cache = lower_suffix_contents
        if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS):
            self._relaxed_path_cache = {fn.lower() for fn in contents}
            self._relaxed_path_cache = {fn.lower(): fn for fn in contents}

        self._cache_is_normalized = False

    def _normalize_cache(self):
        """Normalize all entries in the caches to NFKC."""
        from unicodedata import normalize

        self._path_cache = { normalize('NFKC', p): p for p in self._path_cache }
        self._relaxed_path_cache = { normalize('NFKC', p): p for p in self._relaxed_path_cache }
        self._cache_is_normalized = True


    @classmethod
    def path_hook(cls, *loader_details):
diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py
 import threading
 import time
 import types
 import unicodedata
 import unittest
 from unittest import mock
 import _imp
        start = 2900 + sys.version_info.minor * 50
        self.assertIn(magic_number, range(start, start + 50))

 class TestImportAccented(unittest.TestCase):
    # XXX: There should be tests with PYTHONCASEOK as well
    #      (for those platforms where this is relevant)
    dir_name = os.path.abspath(TESTFN)

    def setUp(self):
        self.sys_path = sys.path[:]
        os.mkdir(self.dir_name)
        sys.path.insert(0, self.dir_name)
        importlib.invalidate_caches()

    def tearDown(self):
        sys.path[:] = self.sys_path
        importlib.invalidate_caches()
        rmtree(self.dir_name)

    def assert_importing_possible(self, name):
        normalized = unicodedata.normalize('NFKC', name)
        filename = os.path.join(self.dir_name, f"{name}.py")
        with open(filename, "w") as stream:
            stream.write("SPAM = 'spam'\n")

        values = {}
        exec(f"from {name} import SPAM", values, values)
        try:
            self.assertEqual(values["SPAM"], "spam")
            self.assertIn(normalized, sys.modules)
        finally:
            del sys.modules[normalized]

    def test_import_precomposed(self):
        name = 'M\u00E4dchen'
        self.assert_importing_possible(name)

    def test_import_normalized(self):
        name = 'M\u0061\u0308dchen'
        self.assert_importing_possible(name)

    def test_import_macos_input(self):
        name = 'Mädchen'
        self.assert_importing_possible(name)

 if __name__ == '__main__':
    # Test needs to be a package, so we can do relative imports.
diff --git a/Lib/test/test_importlib/source/test_case_sensitivity.py b/Lib/test/test_importlib/source/test_case_sensitivity.py
            self.assertIsNotNone(sensitive)
            self.assertIn(self.name, sensitive.get_filename(self.name))
            self.assertIsNotNone(insensitive)
            self.assertIn(self.name, insensitive.get_filename(self.name))
            self.assertIn(self.name.lower(), insensitive.get_filename(self.name))


 class CaseSensitivityTestPEP451(CaseSensitivityTest):
diff --git a/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst b/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst
 It is now possible to import modules from the filesystem regardless of how
 the name is normalized in the filesystem. This in particular affects
 importing modules with a name that contains accented latin characters.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -256,6 +256,9 @@ ABC hierarchy::
		:func:`importlib.util.spec_from_loader` may be useful for implementing
		concrete ``MetaPathFinders``.

		Fullname must be normalized in NFKC to match the normalization
		done by the Python parser.

		.. versionadded:: 3.4

		.. method:: invalidate_caches()
Expand DownExpand Up		@@ -290,6 +293,9 @@ ABC hierarchy::
		guess about what spec to return. :func:`importlib.util.spec_from_loader`
		may be useful for implementing concrete ``PathEntryFinders``.

		Fullname must be normalized in NFKC to match the normalization
		done by the Python parser.

		.. versionadded:: 3.4

		.. method:: invalidate_caches()
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -26,6 +26,13 @@ def _object_name(obj):
		except AttributeError:
		return type(obj).__qualname__

		def _normalize(name):
		"""Normalize 'name' to NKFC form."""
		global _unicodedata_normalize
		if _unicodedata_normalize is None:
		from unicodedata import normalize as _unicodedata_normalize
		return _unicodedata_normalize('NFKC', name)

		# Bootstrap-related code ######################################################

		# Modules injected manually by _setup()
Expand All		@@ -36,6 +43,8 @@ def _object_name(obj):
		# Import done by _install_external_importers()
		_bootstrap_external = None

		# Import done lazily as needed by _normalize as unicodedata is not built-in.
		_unicodedata_normalize = None

		def _wrap(new, old):
		"""Simple substitute for functools.update_wrapper."""
Expand DownExpand Up		@@ -1392,7 +1401,15 @@ def _gcd_import(name, package=None, level=0):
		the loader did not.

		"""
		global _unicodedata
		_sanity_check(name, package, level)

		if not name.isascii():
		name = _normalize(name)

		if package is not None and not package.isascii():
		package = _normalize(package)

		if level > 0:
		name = _resolve_name(name, package, level)
		return _find_and_load(name, _gcd_import)
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1345,8 +1345,9 @@ def __init__(self, path, *loader_details):
		else:
		self.path = _path_abspath(path)
		self._path_mtime = -1
		self._path_cache = set()
		self._relaxed_path_cache = set()
		self._path_cache = {}
		self._relaxed_path_cache = {}
		self._cache_is_normalized = False

		def invalidate_caches(self):
		"""Invalidate the directory mtime."""
Expand All		@@ -1372,15 +1373,21 @@ def find_spec(self, fullname, target=None):
		self._fill_cache()
		self._path_mtime = mtime
		# tail_module keeps the original casing, for __file__ and friends
		if not tail_module.isascii() and not self._cache_is_normalized:
		self._normalize_cache()
Comment on lines +1376 to +1377 Copy link Member ericsnowcurrentlyDec 28, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. For the sake of`__import__()` and`importlib.import_module()` we will probably need to either normalize`cache_module` or preserve the original name in the cache. The former sounds expensive so the latter might be the way to go. Copy link Member ericsnowcurrentlyDec 28, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Regardless, we should probably have some test cases that use one of those functions rather than syntax. Copy link ContributorAuthor ronaldoussorenDec 29, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. I changed the code to keep the unnormalised name around, we need the unnormalised name to access the filesystem and need the normalised name to compare to`tail_module` (which will generally be normalised by the compiler). I agree that there should be more test cases, I started out with just the minimal test from the issue to see if I can get this to work without bootstrapping issues and get some feedback. I'll definitely add more tests when this approach is seen as acceptable by@brettcannon (as the importlib expert). ericsnowcurrently reacted with thumbs up emoji Copy link Member brettcannonFeb 23, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. If the cache always keeps the raw name then can we drop`_cache_is_normalized`?
		if _relax_case():
		cache = self._relaxed_path_cache
		cache_module = tail_module.lower()
		else:
		cache = self._path_cache
		cache_module = tail_module
		# Check if the module is the name of a directory (and thus a package).
		if cache_module in cache:
		base_path = _path_join(self.path, tail_module)
		try:
		cache_path = cache[cache_module]
		except KeyError:
		pass
		else:
		base_path = _path_join(self.path, cache_path)
		for suffix, loader_class in self._loaders:
		init_filename = '__init__' + suffix
		full_path = _path_join(base_path, init_filename)
Expand All		@@ -1392,15 +1399,21 @@ def find_spec(self, fullname, target=None):
		is_namespace = _path_isdir(base_path)
		# Check for a file w/ a proper suffix exists.
		for suffix, loader_class in self._loaders:
		# XXX: Why is ValueError caught here?
Copy link ContributorAuthor ronaldoussorenDec 29, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. This change worries me a little because I don't understand why the original code catches`ValueError` and then bails out. Removing this like I did here does not result in test failures. Copy link Member brettcannonFeb 23, 2024 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. If`git blame` isn't helpful then it can probably be removed. Copy link Member serhiy-storchakaMay 21, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. UnicodeEncodeError if the name is not compatible with the filesystem encoding, ValueError for embedded null characters?
		#try:
		# full_path = _path_join(self.path, tail_module + suffix)
		#except ValueError:
		# return None
		_bootstrap._verbose_message('trying {}{} in {}', cache_module, suffix, self.path, verbosity=2)
		try:
		full_path = _path_join(self.path, tail_module + suffix)
		except ValueError:
		return None
		_bootstrap._verbose_message('trying {}', full_path, verbosity=2)
		if cache_module + suffix in cache:
		cache_path = cache[cache_module + suffix]
		except KeyError:
		pass

		else:
		full_path = _path_join(self.path, cache_path)
		if _path_isfile(full_path):
		return self._get_spec(loader_class, fullname, full_path,
		None, target)
		return self._get_spec(loader_class, fullname, full_path, None, target)
		if is_namespace:
		_bootstrap._verbose_message('possible namespace for {}', base_path)
		spec = _bootstrap.ModuleSpec(fullname, None)
Expand All		@@ -1420,24 +1433,35 @@ def _fill_cache(self):
		# We store two cached versions, to handle runtime changes of the
		# PYTHONCASEOK environment variable.
		if not sys.platform.startswith('win'):
		self._path_cache =set(contents)
		self._path_cache ={ p: p for p incontents }
		else:
		# Windows users can import modules with case-insensitive file
		# suffixes (for legacy reasons). Make the suffix lowercase here
		# so it's done once instead of for every import. This is safe as
		# the specified suffixes to check against are always specified in a
		# case-sensitive manner.
		lower_suffix_contents =set()
		lower_suffix_contents ={}
		for item in contents:
		name, dot, suffix = item.partition('.')
		if dot:
		new_name = f'{name}.{suffix.lower()}'
		else:
		new_name = name
		lower_suffix_contents.add(new_name)
		lower_suffix_contents[new_name] = item
		self._path_cache = lower_suffix_contents
		if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS):
		self._relaxed_path_cache = {fn.lower() for fn in contents}
		self._relaxed_path_cache = {fn.lower(): fn for fn in contents}

		self._cache_is_normalized = False

		def _normalize_cache(self):
		"""Normalize all entries in the caches to NFKC."""
		from unicodedata import normalize

		self._path_cache = { normalize('NFKC', p): p for p in self._path_cache }
		self._relaxed_path_cache = { normalize('NFKC', p): p for p in self._relaxed_path_cache }
		self._cache_is_normalized = True


		@classmethod
		def path_hook(cls, *loader_details):
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -23,6 +23,7 @@
		import threading
		import time
		import types
		import unicodedata
		import unittest
		from unittest import mock
		import _imp
Expand DownExpand Up		@@ -3372,6 +3373,47 @@ def test_magic_number_endianness(self):
		start = 2900 + sys.version_info.minor * 50
		self.assertIn(magic_number, range(start, start + 50))

		class TestImportAccented(unittest.TestCase):
		# XXX: There should be tests with PYTHONCASEOK as well
		# (for those platforms where this is relevant)
		dir_name = os.path.abspath(TESTFN)

		def setUp(self):
		self.sys_path = sys.path[:]
		os.mkdir(self.dir_name)
		sys.path.insert(0, self.dir_name)
		importlib.invalidate_caches()

		def tearDown(self):
		sys.path[:] = self.sys_path
		importlib.invalidate_caches()
		rmtree(self.dir_name)

		def assert_importing_possible(self, name):
		normalized = unicodedata.normalize('NFKC', name)
		filename = os.path.join(self.dir_name, f"{name}.py")
		with open(filename, "w") as stream:
		stream.write("SPAM = 'spam'\n")

		values = {}
		exec(f"from {name} import SPAM", values, values)
		try:
		self.assertEqual(values["SPAM"], "spam")
		self.assertIn(normalized, sys.modules)
		finally:
		del sys.modules[normalized]

		def test_import_precomposed(self):
		name = 'M\u00E4dchen'
		self.assert_importing_possible(name)

		def test_import_normalized(self):
		name = 'M\u0061\u0308dchen'
		self.assert_importing_possible(name)

		def test_import_macos_input(self):
		name = 'Mädchen'
		self.assert_importing_possible(name)

		if __name__ == '__main__':
		# Test needs to be a package, so we can do relative imports.
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -59,7 +59,7 @@ def test_insensitive(self):
		self.assertIsNotNone(sensitive)
		self.assertIn(self.name, sensitive.get_filename(self.name))
		self.assertIsNotNone(insensitive)
		self.assertIn(self.name, insensitive.get_filename(self.name))
		self.assertIn(self.name.lower(), insensitive.get_filename(self.name))


		class CaseSensitivityTestPEP451(CaseSensitivityTest):
Expand Down
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		It is now possible to import modules from the filesystem regardless of how
		the name is normalized in the filesystem. This in particular affects
		importing modules with a name that contains accented latin characters.
Comment on lines +1 to +3 Copy link ContributorAuthor ronaldoussorenDec 29, 2023 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. This probably also needs a What's New entry, lets worry about that when the PR is closer to acceptance to avoid creating merge conflicts.