Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Use packtab for Unicode table packing #145463

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
behdad wants to merge 4 commits into python:main from behdad:packtab
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 6 additions & 15 deletions Modules/unicodedata.c
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -89,8 +89,7 @@ _getrecord_ex(Py_UCS4 code)
if (code >= 0x110000)
index = 0;
else {
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
index = unicodedata_record_get_record_index(code);
}

return &_PyUnicode_Database_Records[index];
Expand DownExpand Up@@ -493,9 +492,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
if (code < 0 || code >= 0x110000)
index = 0;
else {
index = decomp_index1[(code>>DECOMP_SHIFT)];
index = decomp_index2[(index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand DownExpand Up@@ -539,9 +536,7 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
*index = 0;
}
else {
*index = decomp_index1[(code>>DECOMP_SHIFT)];
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
(code&((1<<DECOMP_SHIFT)-1))];
*index = unicodedata_decomp_get_decomp_index(code);
}

/* high byte is number of hex bytes (usually one or two), low byte
Expand DownExpand Up@@ -711,7 +706,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
const void *data;
Py_UCS4 *output;
Py_ssize_t i, i1, o, len;
int f,l,index,index1,comb;
int f,l,index,comb;
Py_UCS4 code;
Py_ssize_t skipped[20];
int cskipped = 0;
Expand DownExpand Up@@ -810,9 +805,7 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
continue;
}
index = f*TOTAL_LAST + l;
index1 = comp_index[index >> COMP_SHIFT];
code = comp_data[(index1<<COMP_SHIFT)+
(index&((1<<COMP_SHIFT)-1))];
code = unicodedata_comp_get_comp_data(index);
if (code == 0)
goto not_combinable;

Expand DownExpand Up@@ -1396,9 +1389,7 @@ _getucname(PyObject *self,
}

/* get position of codepoint in order of names in the dawg */
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
offset = unicodename_get_dawg_codepoint_pos(code);
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
return 0;

Expand Down
10,323 changes: 3,518 additions & 6,805 deletions Modules/unicodedata_db.h
View file
Open in desktop

Large diffs are not rendered by default.

7,845 changes: 3,147 additions & 4,698 deletions Modules/unicodename_db.h
View file
Open in desktop

Large diffs are not rendered by default.

7 changes: 2 additions & 5 deletions Objects/unicodectype.c
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -48,10 +48,8 @@ gettyperecord(Py_UCS4 code)

if (code >= 0x110000)
index = 0;
else
{
index = index1[(code>>SHIFT)];
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
else {
index = unicodetype_get_type_index(code);
}

return &_PyUnicode_TypeRecords[index];
Expand DownExpand Up@@ -285,4 +283,3 @@ int _PyUnicode_IsAlpha(Py_UCS4 ch)

return (ctype->flags & ALPHA_MASK) != 0;
}

3,635 changes: 1,038 additions & 2,597 deletions Objects/unicodetype_db.h
View file
Open in desktop

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions Tools/unicode/benchmark_unicodedata_category.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""Benchmark Python-level unicodedata.category() lookups.

Runs three fixed workloads:
- all Unicode code points
- BMP only
- ASCII only
"""

from __future__ import annotations

import statistics
import time
import unicodedata


# Passes over each dataset per timed sample.
LOOPS = 5
# Timed samples taken per dataset.
SAMPLES = 7
# Fixed workloads: dataset name -> string of consecutive code points.
DATASETS = {
    "all": "".join(chr(cp) for cp in range(0x110000)),
    "bmp": "".join(chr(cp) for cp in range(0x10000)),
    "ascii": "".join(chr(cp) for cp in range(0x80)),
}


def run_once(chars: str) -> tuple[float, int]:
    """Time LOOPS passes of unicodedata.category over *chars*.

    Returns (elapsed seconds, checksum).  The checksum sums the ordinals
    of the first two characters of every category string returned.
    """
    lookup = unicodedata.category  # hoist the attribute lookup out of the hot loop
    total = 0
    start = time.perf_counter()
    pass_no = 0
    while pass_no < LOOPS:
        for ch in chars:
            cat = lookup(ch)
            total += ord(cat[0]) + ord(cat[1])
        pass_no += 1
    duration = time.perf_counter() - start
    return duration, total


def benchmark(name: str, chars: str) -> None:
    """Take SAMPLES timed samples of run_once over *chars* and print stats."""
    lookups = len(chars) * LOOPS

    # Warm up specialization and caches before timing.
    run_once(chars)

    # Collect (elapsed, checksum) pairs; keep the checksum from the last run.
    runs = [run_once(chars) for _ in range(SAMPLES)]
    timings = [elapsed for elapsed, _ in runs]
    checksum = runs[-1][1] if runs else None

    fastest = min(timings)
    middle = statistics.median(timings)
    average = statistics.fmean(timings)

    print(f"dataset: {name}")
    print(f"codepoints: {len(chars)}")
    print(f"lookups/sample: {lookups}")
    print(f"checksum: {checksum}")
    print(f"best_s: {fastest:.6f}")
    print(f"median_s: {middle:.6f}")
    print(f"mean_s: {average:.6f}")
    print(f"best_ns_per_lookup: {fastest * 1e9 / lookups:.2f}")
    print(f"median_ns_per_lookup: {middle * 1e9 / lookups:.2f}")
    print()


def main() -> None:
    """Print run parameters, then benchmark every fixed dataset in order."""
    print(f"python: {unicodedata.unidata_version=}")
    print(f"samples: {SAMPLES}")
    print(f"loops: {LOOPS}")
    print()

    for label in ("all", "bmp", "ascii"):
        benchmark(label, DATASETS[label])


# Run the benchmarks only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading
Loading

[8]ページ先頭

©2009-2026 Movatter.jp