Uh oh!
There was an error while loading.Please reload this page.
- Notifications
You must be signed in to change notification settings - Fork34k
gh-124951: Optimize base64 encode & decode for an easy 2-3x speedup [no SIMD]#143262
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to ourterms of service andprivacy statement. We’ll occasionally send you account related emails.
Already on GitHub?Sign in to your account
Uh oh!
There was an error while loading.Please reload this page.
Changes fromall commits
2d2be30573eaf3eaac671060dbf51e12273ef388957458c991f8ff744b1245b4b50c73f7b27ee2b13b8130976a9File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -428,6 +428,13 @@ argparse | ||
| inline code when color output is enabled. | ||
| (Contributed by Savannah Ostrowski in :gh:`142390`.) | ||
| base64 & binascii | ||
| ----------------- | ||
| * CPython's underlying base64 implementation now encodes 2x faster and decodes 3x | ||
| faster thanks to simple CPU pipelining optimizations. | ||
gpshead marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
| (Contributed by Gregory P. Smith & Serhiy Storchaka in :gh:`143262`.) | ||
| calendar | ||
| -------- | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and | ||
| related codec has been optimized for modern pipelined CPU architectures and | ||
| now performs 2-3x faster across all platforms. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -76,11 +76,12 @@ get_binascii_state(PyObject *module) | ||
| } | ||
| /* Align to 64 bytes for L1 cache line friendliness */ | ||
| static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = { | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, | ||
| -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63, | ||
| 52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,64,-1,-1, /* PAD->64 detected by fast path */ | ||
| -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, | ||
| 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, | ||
| -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40, | ||
| @@ -101,9 +102,91 @@ static const unsigned char table_a2b_base64[] = { | ||
| /* Max binary chunk size; limited only by available memory */ | ||
| #define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2) | ||
| /* | ||
| * Fast base64 encoding/decoding helpers. | ||
| * | ||
| * Process complete groups without loop-carried dependencies. | ||
| */ | ||
| /* Align to 64 bytes for L1 cache line friendliness */ | ||
| static const unsigned char table_b2a_base64[] Py_ALIGNED(64) = | ||
| "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; | ||
| /* Encode 3 bytes into 4 base64 characters. */ | ||
| static inline void | ||
| base64_encode_trio(const unsigned char *in, unsigned char *out, | ||
| const unsigned char *table) | ||
| { | ||
| unsigned int combined = ((unsigned int)in[0] << 16) | | ||
| ((unsigned int)in[1] << 8) | | ||
| (unsigned int)in[2]; | ||
| out[0] = table[(combined >> 18) & 0x3f]; | ||
| out[1] = table[(combined >> 12) & 0x3f]; | ||
| out[2] = table[(combined >> 6) & 0x3f]; | ||
| out[3] = table[combined & 0x3f]; | ||
| } | ||
| /* Encode multiple complete 3-byte groups. | ||
| * Returns the number of input bytes processed (always a multiple of 3). | ||
| */ | ||
| static inline Py_ssize_t | ||
| base64_encode_fast(const unsigned char *in, Py_ssize_t in_len, | ||
| unsigned char *out, const unsigned char *table) | ||
| { | ||
| Py_ssize_t n_trios = in_len / 3; | ||
| const unsigned char *in_end = in + n_trios * 3; | ||
| while (in < in_end) { | ||
| base64_encode_trio(in, out, table); | ||
| in += 3; | ||
| out += 4; | ||
| } | ||
| return n_trios * 3; | ||
| } | ||
| /* Decode 4 base64 characters into 3 bytes. | ||
| * Returns 1 on success, 0 if any character is invalid. | ||
| */ | ||
| static inline int | ||
| base64_decode_quad(const unsigned char *in, unsigned char *out, | ||
| const unsigned char *table) | ||
| { | ||
| unsigned char v0 = table[in[0]]; | ||
| unsigned char v1 = table[in[1]]; | ||
| unsigned char v2 = table[in[2]]; | ||
| unsigned char v3 = table[in[3]]; | ||
| if ((v0 | v1 | v2 | v3) & 0xc0) { | ||
| return 0; | ||
| } | ||
| out[0] = (v0 << 2) | (v1 >> 4); | ||
| out[1] = (v1 << 4) | (v2 >> 2); | ||
| out[2] = (v2 << 6) | v3; | ||
| return 1; | ||
| } | ||
| /* Decode multiple complete 4-character groups (no padding allowed). | ||
| * Returns the number of input characters processed. | ||
| * Stops at the first invalid character, padding, or incomplete group. | ||
| */ | ||
| static inline Py_ssize_t | ||
| base64_decode_fast(const unsigned char *in, Py_ssize_t in_len, | ||
| unsigned char *out, const unsigned char *table) | ||
| { | ||
| Py_ssize_t n_quads = in_len / 4; | ||
| Py_ssize_t i; | ||
| for (i = 0; i < n_quads; i++) { | ||
| if (!base64_decode_quad(in + i * 4, out + i * 3, table)) { | ||
gpshead marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
| break; | ||
| } | ||
| } | ||
| return i * 4; | ||
| } | ||
| static const unsigned short crctab_hqx[256] = { | ||
| 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, | ||
| @@ -403,10 +486,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode) | ||
| goto error_end; | ||
| } | ||
| size_t i = 0; /* Current position in input */ | ||
| /* Fast path: use optimized decoder for complete quads. | ||
| * This works for both strict and non-strict mode for valid input. | ||
| * The fast path stops at padding, invalid chars, or incomplete groups. | ||
| */ | ||
| if (ascii_len >= 4) { | ||
| Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len, | ||
| bin_data, table_a2b_base64); | ||
| if (fast_chars > 0) { | ||
gpshead marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
| i = (size_t)fast_chars; | ||
| bin_data += (fast_chars / 4) * 3; | ||
| } | ||
| } | ||
| /* Slow path: handle remaining input (padding, invalid chars, partial groups) */ | ||
| int quad_pos = 0; | ||
| unsigned char leftchar = 0; | ||
| int pads = 0; | ||
| for (; i < ascii_len; i++) { | ||
| unsigned char this_ch = ascii_data[i]; | ||
| /* Check for pad sequences and ignore | ||
| @@ -533,9 +632,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) | ||
| /*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/ | ||
| { | ||
| const unsigned char *bin_data; | ||
| Py_ssize_t bin_len; | ||
| binascii_state *state; | ||
| @@ -566,26 +662,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline) | ||
| } | ||
| unsigned char *ascii_data = PyBytesWriter_GetData(writer); | ||
| /* Use the optimized fast path for complete 3-byte groups */ | ||
| Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data, | ||
| table_b2a_base64); | ||
| bin_data += fast_bytes; | ||
| ascii_data += (fast_bytes / 3) * 4; | ||
gpshead marked this conversation as resolved. Show resolvedHide resolvedUh oh!There was an error while loading.Please reload this page. | ||
| bin_len -= fast_bytes; | ||
| /* Handle remaining 0-2 bytes */ | ||
| if (bin_len == 1) { | ||
| /* 1 byte remaining: produces 2 base64 chars + 2 padding */ | ||
| unsigned int val = bin_data[0]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val << 4) & 0x3f]; | ||
| *ascii_data++ = BASE64_PAD; | ||
| *ascii_data++ = BASE64_PAD; | ||
| } | ||
| else if (bin_len == 2) { | ||
| /* 2 bytes remaining: produces 3 base64 chars + 1 padding */ | ||
| unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f]; | ||
| *ascii_data++ = table_b2a_base64[(val << 2) & 0x3f]; | ||
| *ascii_data++ = BASE64_PAD; | ||
| } | ||
| if (newline) | ||
| *ascii_data++ = '\n'; /* Append a courtesy newline */ | ||
Uh oh!
There was an error while loading.Please reload this page.