"""Demo script for the inplace float mutation optimization in the tier 2 optimizer../configure --enable-experimental-jit=interpreter --with-pydebug./configure --enable-experimental-jit=yes --with-pydebugUsage: ./python jit_float_demo.py # filtered trace (float-related ops) ./python jit_float_demo.py --all # full trace (all ops)"""import sysimport timeitSHOW_ALL = "--all" in sys.argv# --- System info ---print("=" * 60)print("CPython JIT / Tier 2 Demo")print("=" * 60)print(f"Python version: {sys.version}")print(f"Debug build: {hasattr(sys, 'gettotalrefcount')}")print(f"Free-threaded: {not sys._is_gil_enabled()}")jit_mod = getattr(sys, "_jit", None)if jit_mod is not None: print(f"JIT available: {jit_mod.is_available()}") print(f"JIT enabled: {jit_mod.is_enabled()}")else: print("JIT: not compiled in")tier2 = Falsetry: from _testinternalcapi import TIER2_THRESHOLD tier2 = True print(f"Tier 2: enabled (threshold={TIER2_THRESHOLD})")except (ImportError, AttributeError): print("Tier 2: disabled (build without _Py_TIER2)")print()# --- Example functions ---def f_adds(n, a, b, c): """a*b + c per iteration — the multiply result is unique and reused.""" total = 0.0 for i in range(n): total = a + b + c return totaldef f_chain(n, a, b, c): """a*b + c per iteration — the multiply result is unique and reused.""" total = 0.0 for i in range(n): total += a * b + c return totaldef f_simple_add(n, a, b): """a + b per iteration — no unique intermediate.""" total = 0.0 for i in range(n): total += a + b return totaldef f_long_chain(n, a, b, c, d): """a*b + c*d per iteration — two unique intermediates.""" total = 0.0 for i in range(n): total += a * b + c * d return totaldef f_negate(n, a, b): """a*b + c*d per iteration — two unique intermediates.""" total = 0.0 for i in range(n): total = - (a + b) return total# --- Warm up to trigger tier 2 ---LOOP = 10_000f_adds(LOOP, 2.0, 3.0, 4.0)f_chain(LOOP, 2.0, 3.0, 4.0)f_simple_add(LOOP, 2.0, 3.0)f_long_chain(LOOP, 2.0, 3.0, 4.0, 5.0)# --- Op annotation ---def annotate_op(name, oparg, func): """Return a human-readable annotation for a uop.""" varnames = func.__code__.co_varnames consts = func.__code__.co_consts # _LOAD_FAST_BORROW_3 → local index 3 for prefix in ("_LOAD_FAST_BORROW_", "_LOAD_FAST_", "_SWAP_FAST_"): if name.startswith(prefix): idx = int(name[len(prefix):]) local = varnames[idx] if idx < len(varnames) else f"local{idx}" return local # _LOAD_CONST_INLINE_BORROW etc — operand is a pointer, not useful # but if oparg is a small index into consts, show it if "LOAD_CONST" in name and oparg < len(consts): return repr(consts[oparg]) # Binary ops if "MULTIPLY" in name: return "*" if "SUBTRACT" in name: return "-" if "ADD" in name and "UNICODE" not in name: return "+" # Guards if name == "_GUARD_TOS_FLOAT": return "top is float?" if name == "_GUARD_NOS_FLOAT": return "2nd is float?" if name == "_GUARD_TOS_INT": return "top is int?" if name == "_GUARD_NOS_INT": return "2nd is int?" if "NOT_EXHAUSTED" in name: return "iter not done?" # Pop / cleanup if name == "_POP_TOP_NOP": return "skip (borrowed/null)" if name == "_POP_TOP_FLOAT": return "decref float" if name == "_POP_TOP_INT": return "decref int" # Control flow if name == "_JUMP_TO_TOP": return "loop" if name == "_EXIT_TRACE": return "exit" if name == "_DEOPT": return "deoptimize" if name == "_ERROR_POP_N": return "error handler" if name == "_START_EXECUTOR": return "trace entry" if name == "_MAKE_WARM": return "warmup counter" return ""# --- Show traces ---FILTER_KEYWORDS = ( "FLOAT", "INPLACE", "BINARY_OP", "NOP", "LOAD_FAST", "LOAD_CONST", "GUARD",)has_get_executor = Falseif tier2: try: from _opcode import get_executor has_get_executor = True except ImportError: passif has_get_executor: mode = "all ops" if SHOW_ALL else "float-related ops only" print("-" * 60) print(f"Tier 2 traces ({mode})") print("-" * 60) for label, func in [ ("f_adds: total = a + b + c", f_adds), ("f_chain: total += a * b + c", f_chain), ("f_simple_add: total += a + b", f_simple_add), ("f_long_chain: total += a * b + c * d", f_long_chain), ]: code = func.__code__ found = False for i in range(len(code.co_code) // 2): try: ex = get_executor(code, i * 2) except (ValueError, TypeError, RuntimeError): continue if ex is None: continue print(f"\n {label}") for j, op in enumerate(ex): name, oparg = op[0], op[1] if not SHOW_ALL: if not any(k in name for k in FILTER_KEYWORDS): continue annotation = annotate_op(name, oparg, func) marker = " <<<" if "INPLACE" in name else "" if annotation: print(f" {j:3d}: {name:45s} # {annotation}{marker}") else: print(f" {j:3d}: {name}{marker}") found = True break if not found: print(f"\n {label}: (no executor found)") print()else: print("-" * 60) print("Tier 2 traces: skipped (tier 2 not available)") print("-" * 60) print()# --- Benchmark ---print("-" * 60)print("Benchmark")print("-" * 60)N = 2_000_000INNER = 1000benchmarks = [ ("total = a + b + c", lambda: f_adds(INNER, 2.0, 3.0, 4.0)), ("total += a*b + c ", lambda: f_chain(INNER, 2.0, 3.0, 4.0)), ("total += a + b ", lambda: f_simple_add(INNER, 2.0, 3.0)), ("total += a*b + c*d", lambda: f_long_chain(INNER, 2.0, 3.0, 4.0, 5.0)), ("total = - (a + b) ", lambda: f_negate(INNER, 2.0, 3.0)),]for label, fn in benchmarks: iters = N // INNER t = timeit.timeit(fn, number=iters) ns_per = t / N * 1e9 print(f" {label}: {t:.3f}s ({ns_per:.0f} ns/iter)")print()print("The 'a*b + c' case benefits from _BINARY_OP_ADD_FLOAT_INPLACE:")print("the result of a*b is uniquely referenced, so the addition")print("mutates it in place instead of allocating a new float.")# --- N-body benchmark from pyperformance ---print()print("-" * 60)print("N-body benchmark (from pyperformance)")print("-" * 60)PI = 3.14159265358979323SOLAR_MASS = 4 * PI * PIDAYS_PER_YEAR = 365.24def _nbody_make_system(): bodies = [ # sun ([0.0, 0.0, 0.0], [0.0, 0.0, 0.0], SOLAR_MASS), # jupiter ([4.84143144246472090e+00, -1.16032004402742839e+00, -1.03622044471123109e-01], [1.66007664274403694e-03*DAYS_PER_YEAR, 7.69901118419740425e-03*DAYS_PER_YEAR, -6.90460016972063023e-05*DAYS_PER_YEAR], 9.54791938424326609e-04*SOLAR_MASS), # saturn ([8.34336671824457987e+00, 4.12479856412430479e+00, -4.03523417114321381e-01], [-2.76742510726862411e-03*DAYS_PER_YEAR, 4.99852801234917238e-03*DAYS_PER_YEAR, 2.30417297573763929e-05*DAYS_PER_YEAR], 2.85885980666130812e-04*SOLAR_MASS), # uranus ([1.28943695621391310e+01, -1.51111514016986312e+01, -2.23307578892655734e-01], [2.96460137564761618e-03*DAYS_PER_YEAR, 2.37847173959480950e-03*DAYS_PER_YEAR, -2.96589568540237556e-05*DAYS_PER_YEAR], 4.36624404335156298e-05*SOLAR_MASS), # neptune ([1.53796971148509165e+01, -2.59193146099879641e+01, 1.79258772950371181e-01], [2.68067772490389322e-03*DAYS_PER_YEAR, 1.62824170038242295e-03*DAYS_PER_YEAR, -9.51592254519715870e-05*DAYS_PER_YEAR], 5.15138902046611451e-05*SOLAR_MASS), ] pairs = [] for x in range(len(bodies) - 1): for y in bodies[x + 1:]: pairs.append((bodies[x], y)) return bodies, pairsdef _nbody_advance(dt, n, bodies, pairs): for i in range(n): for (([x1, y1, z1], v1, m1), ([x2, y2, z2], v2, m2)) in pairs: dx = x1 - x2 dy = y1 - y2 dz = z1 - z2 mag = dt * ((dx * dx + dy * dy + dz * dz) ** (-1.5)) b1m = m1 * mag b2m = m2 * mag v1[0] -= dx * b2m v1[1] -= dy * b2m v1[2] -= dz * b2m v2[0] += dx * b1m v2[1] += dy * b1m v2[2] += dz * b1m for (r, [vx, vy, vz], m) in bodies: r[0] += dt * vx r[1] += dt * vy r[2] += dt * vzdef _nbody_report_energy(bodies, pairs, e=0.0): for (((x1, y1, z1), v1, m1), ((x2, y2, z2), v2, m2)) in pairs: dx = x1 - x2 dy = y1 - y2 dz = z1 - z2 e -= (m1 * m2) / ((dx * dx + dy * dy + dz * dz) ** 0.5) for (r, [vx, vy, vz], m) in bodies: e += m * (vx * vx + vy * vy + vz * vz) / 2. return edef _nbody_offset_momentum(ref, bodies, px=0.0, py=0.0, pz=0.0): for (r, [vx, vy, vz], m) in bodies: px -= vx * m py -= vy * m pz -= vz * m (r, v, m) = ref v[0] = px / m v[1] = py / m v[2] = pz / mdef bench_nbody(iterations=20000): bodies, pairs = _nbody_make_system() _nbody_offset_momentum(bodies[0], bodies) _nbody_report_energy(bodies, pairs) _nbody_advance(0.01, iterations, bodies, pairs) _nbody_report_energy(bodies, pairs)# Warmupbench_nbody(1000)NBODY_RUNS = 5t = timeit.timeit(lambda: bench_nbody(20000), number=NBODY_RUNS)print(f" nbody (20k iterations, {NBODY_RUNS} runs): {t/NBODY_RUNS*1000:.1f} ms/run")
Uh oh!
There was an error while loading.Please reload this page.
We can add the following tier 2 micro-ops that mutate the uniquely-referenced operand:
_BINARY_OP_ADD_FLOAT_INPLACE/_INPLACE_RIGHT— unique LHS / RHS_BINARY_OP_SUBTRACT_FLOAT_INPLACE/_INPLACE_RIGHT— unique LHS / RHS_BINARY_OP_MULTIPLY_FLOAT_INPLACE/_INPLACE_RIGHT— unique LHS / RHS_UNARY_NEGATIVE_FLOAT_INPLACE— unique operandThe
_RIGHTvariants handle commutative ops (add, multiply) plus subtract when only the RHS is unique. The optimizer emits these inoptimizer_bytecodes.cwhenPyJitRef_IsUnique(left)orPyJitRef_IsUnique(right)is true and the operand is a known float. The mutated operand is marked as borrowed so the following_POP_TOPbecomes_POP_TOP_NOP.Micro-benchmarks:
total += a*b + ctotal += a + btotal += a*b + c*dpyperformance nbody (20k iterations):
Followup
Some operations that could be added in followup PR's: division of floats, operations on a float and an int with a uniquely referenced float, integer operations (but this is move involved because of small ints and number of digits), operations with complex numbers.
Script