Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD]#143262

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub?Sign in to your account

Merged
gpshead merged 13 commits into python:main from gpshead:opt-base64-cpu
Jan 2, 2026
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
13 commits
Select commitHold shift + click to select a range
2d2be30
Add base64 benchmark tool and optimize encoding/decoding
gpsheadDec 28, 2025
573eaf3
Fix MSVC build: move table_b2a_base64 before inline functions
gpsheadDec 29, 2025
eaac671
NEWS entry
gpsheadDec 29, 2025
060dbf5
Add a whatsnew entry
gpsheadDec 29, 2025
1e12273
expose the benchmark defaults in the help text.
gpsheadDec 29, 2025
ef38895
Align base64 tables to 64-byte cache line boundaries
gpsheadDec 29, 2025
7458c99
Use BASE64_PAD macro instead of literal '=' in fast path
gpsheadDec 29, 2025
1f8ff74
Simplify block comment for base64 helpers
gpsheadDec 29, 2025
4b1245b
Remove binasciibench in favor of pyperformance bm_base64 addition PR.
gpsheadDec 29, 2025
4b50c73
reword L1 cache line comment
gpsheadJan 2, 2026
f7b27ee
doc Contributed by
gpsheadJan 2, 2026
2b13b81
Remove redundant PAD check in base64 decode fast path
gpsheadJan 2, 2026
30976a9
Use pointer increment in base64_encode_fast
gpsheadJan 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletionsDoc/whatsnew/3.15.rst
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -428,6 +428,13 @@ argparse
inline code when color output is enabled.
(Contributed by Savannah Ostrowski in :gh:`142390`.)

base64 & binascii
-----------------

* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x
faster thanks to simple CPU pipelining optimizations.
(Contributed by Gregory P. Smith & Serhiy Storchaka in :gh:`143262`.)

calendar
--------

Expand Down
View file
Open in desktop
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
related codec has been optimized for modern pipelined CPU architectures and
now performs 2-3x faster across all platforms.
147 changes: 124 additions & 23 deletionsModules/binascii.c
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -76,11 +76,12 @@ get_binascii_state(PyObject *module)
}


static const unsigned char table_a2b_base64[] = {
/* Align to 64 bytes for L1 cache line friendliness */
static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1, 0,-1,-1, /*NotePAD->0 */
52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
-1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
Expand All@@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = {
/* Max binary chunk size; limited only by available memory */
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)

static const unsigned char table_b2a_base64[] =
/*
* Fast base64 encoding/decoding helpers.
*
* Process complete groups without loop-carried dependencies.
*/

/* Align to 64 bytes for L1 cache line friendliness */
static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Expand one 3-byte group into its 4-character base64 representation.
 * `table` is the 64-entry encoding alphabet; `out` must have room for
 * 4 bytes. No padding is emitted here — callers handle partial groups.
 */
static inline void
base64_encode_trio(const unsigned char *in, unsigned char *out,
                   const unsigned char *table)
{
    /* Pack the three octets into one 24-bit value, then peel off four
       6-bit alphabet indices from the most significant end downward. */
    unsigned int bits = (unsigned int)in[0];
    bits = (bits << 8) | in[1];
    bits = (bits << 8) | in[2];
    for (int shift = 18; shift >= 0; shift -= 6) {
        *out++ = table[(bits >> shift) & 0x3f];
    }
}

/* Encode every complete 3-byte group in the input.
 * Returns the number of input bytes consumed (always a multiple of 3);
 * callers deal with the 0-2 leftover bytes and any '=' padding.
 */
static inline Py_ssize_t
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    /* Round the length down to a whole number of trios up front so the
       loop condition is a simple index comparison. */
    Py_ssize_t consumed = in_len - (in_len % 3);

    for (Py_ssize_t pos = 0; pos < consumed; pos += 3) {
        base64_encode_trio(in + pos, out, table);
        out += 4;
    }

    return consumed;
}

/* Decode 4 base64 characters into 3 bytes.
 * Returns 1 on success, 0 if any character is not part of the core
 * 64-character alphabet (invalid bytes map to 0xff and the PAD
 * sentinel maps to 64 in `table`, so both trip the 0xc0 mask below).
 */
static inline int
base64_decode_quad(const unsigned char *in, unsigned char *out,
                   const unsigned char *table)
{
    unsigned char s0 = table[in[0]];
    unsigned char s1 = table[in[1]];
    unsigned char s2 = table[in[2]];
    unsigned char s3 = table[in[3]];

    /* Genuine sextets are 0..63; anything with bit 6 or 7 set is a
       rejection sentinel from the lookup table. */
    if ((s0 | s1 | s2 | s3) & 0xc0) {
        return 0;
    }

    /* Reassemble the 24-bit group, then split it back into octets. */
    unsigned int word = ((unsigned int)s0 << 18) | ((unsigned int)s1 << 12) |
                        ((unsigned int)s2 << 6) | (unsigned int)s3;
    out[0] = (unsigned char)(word >> 16);
    out[1] = (unsigned char)(word >> 8);
    out[2] = (unsigned char)word;
    return 1;
}

/* Decode multiple complete 4-character groups (no padding allowed).
 * Returns the number of input characters processed (a multiple of 4).
 * Stops at the first invalid character, padding, or incomplete group,
 * leaving the remainder for the byte-at-a-time slow path.
 */
static inline Py_ssize_t
base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    Py_ssize_t done = 0;
    Py_ssize_t limit = (in_len / 4) * 4;  /* whole quads only */

    while (done < limit) {
        if (!base64_decode_quad(in + done, out, table)) {
            break;
        }
        done += 4;
        out += 3;
    }

    return done;
}


static const unsigned short crctab_hqx[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
Expand DownExpand Up@@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
goto error_end;
}

size_t i = 0; /* Current position in input */

/* Fast path: use optimized decoder for complete quads.
* This works for both strict and non-strict mode for valid input.
* The fast path stops at padding, invalid chars, or incomplete groups.
*/
if (ascii_len >= 4) {
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
bin_data, table_a2b_base64);
if (fast_chars > 0) {
i = (size_t)fast_chars;
bin_data += (fast_chars / 4) * 3;
}
}

/* Slow path: handle remaining input (padding, invalid chars, partial groups) */
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
for (size_t i = 0; i < ascii_len; i++) {
for (; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];

/* Check for pad sequences and ignore
Expand DownExpand Up@@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
/*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
{
const unsigned char *bin_data;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t bin_len;
binascii_state *state;

Expand DownExpand Up@@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
}
unsigned char *ascii_data = PyBytesWriter_GetData(writer);

for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
/* Shift the data into our buffer */
leftchar = (leftchar << 8) | *bin_data;
leftbits += 8;

/* See if there are 6-bit groups ready */
while ( leftbits >= 6 ) {
this_ch = (leftchar >> (leftbits-6)) & 0x3f;
leftbits -= 6;
*ascii_data++ = table_b2a_base64[this_ch];
}
}
if ( leftbits == 2 ) {
*ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
/* Use the optimized fast path for complete 3-byte groups */
Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
table_b2a_base64);
bin_data += fast_bytes;
ascii_data += (fast_bytes / 3) * 4;
bin_len -= fast_bytes;

/* Handle remaining 0-2 bytes */
if (bin_len == 1) {
/* 1 byte remaining: produces 2 base64 chars + 2 padding */
unsigned int val = bin_data[0];
*ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
*ascii_data++ = BASE64_PAD;
*ascii_data++ = BASE64_PAD;
} else if ( leftbits == 4 ) {
*ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
}
else if (bin_len == 2) {
/* 2 bytes remaining: produces 3 base64 chars + 1 padding */
unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
*ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
*ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
*ascii_data++ = BASE64_PAD;
}

if (newline)
*ascii_data++ = '\n'; /* Append a courtesy newline */

Expand Down
Loading

[8]ページ先頭

©2009-2026 Movatter.jp