Jul 19, 2024 · Jul 19, 2024 · Jul 20, 2024 · Jul 24, 2024 · Jul 26, 2024 · Aug 1, 2024
diff --git a/examples_tests b/examples_tests
diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h

 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_

 #include "nbl/builtin/hlsl/cpp_compat.hlsl"

 namespace nbl
 {
 namespace hlsl
 {
 namespace luma_meter
 {

 // Scale/offset pair describing the metered region; the meters apply it to a coordinate as
 // `coord * meteringWindowScale + meteringWindowOffset`.
 struct MeteringWindow
 {
     using this_t = MeteringWindow;

     // Factor the coordinate is multiplied by.
     float32_t2 meteringWindowScale;
     // Bias added after the multiplication.
     float32_t2 meteringWindowOffset;

     // Builds a window from an explicit scale and offset.
     static this_t create(float32_t2 scale, float32_t2 offset)
     {
         this_t window;
         window.meteringWindowOffset = offset;
         window.meteringWindowScale = scale;
         return window;
     }
 };

 }
 }
 }

 #endif
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
 // Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h

 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_

 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"

 namespace nbl
 {
 namespace hlsl
 {
 namespace luma_meter
 {

 // Luma meter built on a sum of log2(luma).
 //  - `sampleLuma` takes one log2(luma) sample per invocation, reduces the samples across the
 //    workgroup and atomically adds the encoded sum through `ValueAccessor`.
 //  - `gatherLuma` reads the accumulated slots back, sums them across the subgroup and divides by
 //    `sampleCount`.
 //
 // Requirements visible from the code below:
 //  - SharedAccessor::type : scalar type used for the luma math (float32_t selects the 32-bit vector
 //                           aliases, any other type selects the float16_t ones)
 //  - ValueAccessor        : `atomicAdd(index, uint32_t)` and `get(index)`
 //  - TexAccessor          : `get(uv)` returning a colour and a static `toXYZ(colour)`
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_meter {
    using float_t = typename SharedAccessor::type;
    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;

    // Builds a meter.
    //  lumaMinMax  : x = lower clamp bound, y = upper clamp bound applied to every luma sample
    //  sampleCount : divisor applied by `gatherLuma`
    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
    {
        this_t retval;
        retval.lumaMinMax = lumaMinMax;
        retval.sampleCount = sampleCount;
        return retval;
    }

    // Workgroup-wide sum of `value`; `sdata` is handed to workgroup::reduction.
    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
    {
        return workgroup::reduction < plus < float_t >, GroupSize >::
            template __call <SharedAccessor>(value, sdata);
    }

    // Fixed-point bit budget derived from the current dispatch:
    //   32 - ceil(log2(total workgroup count)) + log2(subgroup size)
    // The result is clamped to 31 so `1u << bits` is always a valid 32-bit shift. Unclamped, the
    // expression reaches 32 or more as soon as the dispatch has no more workgroups than a subgroup
    // has invocations, and a signed `1 << 31` lands in the sign bit.
    // NOTE(review): this reads gl_NumWorkGroups of whichever dispatch is executing, so the sampling
    // pass and the gather pass only agree when both are dispatched with the same workgroup count.
    static uint32_t __fixedPointBitsLeft()
    {
        const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
        const uint32_t totalWorkGroups = workGroupCount.x * workGroupCount.y * workGroupCount.z;
        const uint32_t countBits = uint32_t(ceil(log2(float32_t(totalWorkGroups))));
        return min(32u - countBits + glsl::gl_SubgroupSizeLog2(), 31u);
    }

    // Fetches the colour at `shiftedCoord` mapped through the metering window, converts it with
    // TexAccessor::toXYZ, clamps to [lumaMinMax.x, lumaMinMax.y] and returns log2 of the result.
    float_t __computeLumaLog2(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(TexAccessor) tex,
        float_t2 shiftedCoord
    )
    {
        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
        float_t3 color = tex.get(uvPos);
        float_t luma = (float_t)TexAccessor::toXYZ(color);

        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);

        return log2(luma);
    }

    // Encodes `(val - minLog2) * rangeLog2` as an unsigned integer, clamped to
    // [0, 2^fixedPointBitsLeft - 1], and atomically adds it to one accumulator slot.
    void __uploadFloat(
        NBL_REF_ARG(ValueAccessor) val_accessor,
        float_t val,
        float_t minLog2,
        float_t rangeLog2
    )
    {
        // NOTE(review): the slot is derived from the workgroup COUNT, not from this workgroup's own
        // index, so every workgroup of a dispatch targets the same slot. Kept as-is because changing
        // the slot layout affects whoever clears/reads the buffer -- confirm with the caller.
        const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
        const uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
        const uint32_t slotMask = (1u << glsl::gl_SubgroupSizeLog2()) - 1u;

        // NOTE(review): the value is not multiplied by a fixed-point scale here although
        // `gatherLuma` divides by 2^fixedPointBitsLeft -- verify the intended encoding.
        const float32_t maxPattern = float32_t((1u << __fixedPointBitsLeft()) - 1u);
        const uint32_t lumaSumBitPattern = uint32_t(clamp(float32_t((val - minLog2) * rangeLog2), 0.f, maxPattern));

        val_accessor.atomicAdd(workgroupIndex & slotMask, lumaSumBitPattern);
    }

    // Reads accumulator slot `index` (wrapped to the subgroup size) and applies the inverse of the
    // affine mapping used on upload: `slot / rangeLog2 + minLog2`.
    float_t __downloadFloat(
        NBL_REF_ARG(ValueAccessor) val_accessor,
        uint32_t index,
        float_t minLog2,
        float_t rangeLog2
    )
    {
        const uint32_t slotMask = (1u << glsl::gl_SubgroupSizeLog2()) - 1u;
        float_t luma = (float_t)val_accessor.get(index & slotMask);
        return luma / rangeLog2 + minLog2;
    }

    // Sampling pass: each invocation samples one texel, the workgroup sums the log2(luma) values and
    // invocation 0 uploads the sum.
    //  window       : metering window applied to the normalized coordinate
    //  val          : accumulator written through atomicAdd
    //  tex          : texture accessor
    //  sdata        : scratch handed to the workgroup reduction
    //  tileOffset   : offset added to the per-invocation Morton-decoded coordinate
    //  viewportSize : divisor that normalizes the coordinate
    void sampleLuma(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(ValueAccessor) val,
        NBL_REF_ARG(TexAccessor) tex,
        NBL_REF_ARG(SharedAccessor) sdata,
        float_t2 tileOffset,
        float_t2 viewportSize
    )
    {
        uint32_t tid = workgroup::SubgroupContiguousIndex();
        uint32_t2 coord = {
            morton2d_decode_x(tid),
            morton2d_decode_y(tid)
        };

        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);

        if (tid == 0) {
            __uploadFloat(
                val,
                lumaLog2Sum,
                log2(lumaMinMax.x),
                log2(lumaMinMax.y / lumaMinMax.x)
            );
        }
    }

    // Gather pass: every subgroup invocation decodes one accumulator slot, the decoded values are
    // summed across the subgroup, and the total is divided by 2^fixedPointBitsLeft and `sampleCount`.
    float_t gatherLuma(
        NBL_REF_ARG(ValueAccessor) val
    )
    {
        uint32_t tid = glsl::gl_SubgroupInvocationID();
        float_t luma = glsl::subgroupAdd(
            __downloadFloat(
                val,
                tid,
                log2(lumaMinMax.x),
                log2(lumaMinMax.y / lumaMinMax.x)
            )
        );

        // Unsigned shift of a clamped amount; the divisor is formed in 32-bit float so it stays
        // finite and positive.
        const float32_t fixedPointScale = float32_t(1u << __fixedPointBitsLeft());

        return float_t((float32_t(luma) / fixedPointScale) / float32_t(sampleCount));
    }

    float_t sampleCount;
    float_t2 lumaMinMax;
 };

 // Histogram-based luma meter.
 //  - `sampleLuma` bins one luma sample per invocation into a workgroup-local histogram held in
 //    `sdata`, runs an inclusive scan over the bins in blocks of GroupSize and atomically adds the
 //    scanned values to `histo`.
 //  - `gatherLuma` copies `histo` into `sdata` and averages the decoded entries found at 40% and
 //    60% of BinCount.
 //
 // Requirements visible from the code below:
 //  - SharedAccessor    : `type`, `set`, `get(index, out)`, `atomicAdd`,
 //                        `workgroupExecutionAndMemoryBarrier`
 //  - HistogramAccessor : `atomicAdd(index, value)` and `get(index)`
 //  - TexAccessor       : `get(uv)` returning a colour and a static `toXYZ(colour)`
 template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
 struct median_meter {
    using int_t = typename SharedAccessor::type;
    using float_t  = float32_t;
    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;

    // Builds a meter that clamps and bins luma inside [lumaMinMax.x, lumaMinMax.y].
    static this_t create(float_t2 lumaMinMax) {
        this_t retval;
        retval.lumaMinMax = lumaMinMax;
        return retval;
    }

    // Workgroup-wide inclusive prefix sum over int_t; `sdata` is handed to workgroup::inclusive_scan.
    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
        return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
            template __call <SharedAccessor>(value, sdata);
    }

    // Fixed-point bit budget derived from the current dispatch:
    //   32 - ceil(log2(total workgroup count)) + log2(subgroup size)
    // Clamped to 31 so `1u << bits` is always a valid 32-bit shift; unclamped the expression reaches
    // 32 or more for dispatches with no more workgroups than a subgroup has invocations.
    static uint32_t __fixedPointBitsLeft() {
        const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
        const uint32_t totalWorkGroups = workGroupCount.x * workGroupCount.y * workGroupCount.z;
        const uint32_t countBits = uint32_t(ceil(log2(float32_t(totalWorkGroups))));
        return min(32u - countBits + glsl::gl_SubgroupSizeLog2(), 31u);
    }

    // Fetches the colour at `shiftedCoord` mapped through the metering window, converts it with
    // TexAccessor::toXYZ and clamps the result to [lumaMinMax.x, lumaMinMax.y].
    float_t __computeLuma(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(TexAccessor) tex,
        float_t2 shiftedCoord
    ) {
        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
        float_t3 color = tex.get(uvPos);
        float_t luma = (float_t)TexAccessor::toXYZ(color);

        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
    }

    // Encodes `(val - minLog2) * rangeLog2` as an integer clamped to [0, 2^fixedPointBitsLeft - 1].
    // NOTE(review): despite the parameter names, every caller in this struct passes the linear
    // minimum and the linear range (lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) -- confirm intent.
    int_t __float2Int(
        float_t val,
        float_t minLog2,
        float_t rangeLog2
    ) {
        const float32_t maxPattern = float32_t((1u << __fixedPointBitsLeft()) - 1u);
        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, maxPattern));
    }

    // Inverse of the affine part of `__float2Int`: `val / rangeLog2 + minLog2`.
    float_t __int2Float(
        int_t val,
        float_t minLog2,
        float_t rangeLog2
    ) {
        return val / rangeLog2 + minLog2;
    }

    // Sampling pass.
    //  window       : metering window applied to the normalized coordinate
    //  histo        : histogram accumulated through atomicAdd
    //  tex          : texture accessor
    //  sdata        : workgroup-local bins, also handed to the scan
    //  tileOffset   : offset added to the per-invocation Morton-decoded coordinate
    //  viewportSize : divisor that normalizes the coordinate
    void sampleLuma(
        NBL_CONST_REF_ARG(MeteringWindow) window,
        NBL_REF_ARG(HistogramAccessor) histo,
        NBL_REF_ARG(TexAccessor) tex,
        NBL_REF_ARG(SharedAccessor) sdata,
        float_t2 tileOffset,
        float_t2 viewportSize
    ) {
        uint32_t tid = workgroup::SubgroupContiguousIndex();

        // zero the workgroup-local bins
        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
            sdata.set(vid, 0);
        }

        sdata.workgroupExecutionAndMemoryBarrier();

        uint32_t2 coord = {
            morton2d_decode_x(tid),
            morton2d_decode_y(tid)
        };

        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
        const float_t luma = __computeLuma(window, tex, shiftedCoord);

        const float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
        // luma is clamped to an inclusive range, so luma == lumaMinMax.y would map to bin BinCount;
        // fold it into the last valid bin.
        const uint32_t binIndex = min((uint32_t)((luma - lumaMinMax.x) / binSize), uint32_t(BinCount) - 1u);

        // NOTE(review): the bin receives the encoded luma value rather than a count of 1 -- confirm
        // this is the intended histogram weight.
        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));

        sdata.workgroupExecutionAndMemoryBarrier();

        // invocations past the last bin contribute zero instead of reading outside the bins
        float_t histogram_value = 0.0f;
        if (tid < BinCount) {
            sdata.get(tid, histogram_value);
        }

        sdata.workgroupExecutionAndMemoryBarrier();

        // the scan must be executed by every invocation, only the write-out is guarded
        float_t sum = __inclusive_scan(histogram_value, sdata);
        if (tid < BinCount) {
            histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
        }

        const bool is_last_wg_invocation = tid == (GroupSize - 1);
        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;

        for (uint32_t i = 1; i < RoundedBinCount; i++) {
            uint32_t keyBucketStart = GroupSize * i;
            uint32_t vid = tid + keyBucketStart;

            // no if statement about the last iteration needed
            if (is_last_wg_invocation) {
                float_t beforeSum;
                sdata.get(keyBucketStart, beforeSum);
                sdata.set(keyBucketStart, beforeSum + sum);
            }

            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
            sdata.workgroupExecutionAndMemoryBarrier();

            // no aliasing anymore
            float_t atVid = 0.0f;
            if (vid < BinCount) {
                sdata.get(vid, atVid);
            }
            sum = __inclusive_scan(atVid, sdata);
            if (vid < BinCount) {
                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
            }
        }
    }

    // Gather pass: copies the histogram into `sdata`, then decodes and averages the entries located
    // at 40% and 60% of BinCount.
    // NOTE(review): this reads the values stored AT those two bins; it does not search for the bins
    // where the cumulative total crosses 40% / 60% -- confirm this is the intended estimate.
    float_t gatherLuma(
        NBL_REF_ARG(HistogramAccessor) histo,
        NBL_REF_ARG(SharedAccessor) sdata
    ) {
        uint32_t tid = workgroup::SubgroupContiguousIndex();

        // vid is already < BinCount, so it is used directly; masking with (BinCount - 1) would alias
        // bins whenever BinCount is not a power of two
        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
            sdata.set(vid, histo.get(vid));
        }

        sdata.workgroupExecutionAndMemoryBarrier();

        const uint32_t bin40 = uint32_t(BinCount * 0.4f);
        const uint32_t bin60 = uint32_t(BinCount * 0.6f);

        uint32_t percentile40, percentile60;
        sdata.get(bin40, percentile40);
        sdata.get(bin60, percentile60);

        const float_t lumaMin = lumaMinMax.x;
        const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;
        return (__int2Float(percentile40, lumaMin, lumaRange) + __int2Float(percentile60, lumaMin, lumaRange)) / 2;
    }

    float_t2 lumaMinMax;
 };

 }
 }
 }

 #endif
+0 −12		23_Autoexposure/CMakeLists.txt
+0 −177		23_Autoexposure/main.cpp
+25 −0		26_Autoexposure/CMakeLists.txt
+68 −0		26_Autoexposure/app_resources/avg_luma_meter.comp.hlsl
+88 −0		26_Autoexposure/app_resources/avg_luma_tonemap.comp.hlsl
+28 −0		26_Autoexposure/app_resources/common.hlsl
+72 −0		26_Autoexposure/app_resources/median_luma_meter.comp.hlsl
+93 −0		26_Autoexposure/app_resources/median_luma_tonemap.comp.hlsl
+20 −0		26_Autoexposure/app_resources/present.frag.hlsl
+0 −0		26_Autoexposure/config.json.template
+1,134 −0		26_Autoexposure/main.cpp
+0 −0		26_Autoexposure/pipeline.groovy
+1 −0		CMakeLists.txt
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,35 @@
		// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
		// This file is part of the "Nabla Engine".
		// For conditions of distribution and use, see copyright notice in nabla.h

		#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
		#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_

		#include "nbl/builtin/hlsl/cpp_compat.hlsl"

		namespace nbl
		{
		namespace hlsl
		{
		namespace luma_meter
		{

		// Scale/offset pair describing the metered region; the meters apply it to a coordinate as
		// `coord * meteringWindowScale + meteringWindowOffset`.
		struct MeteringWindow
		{
		    using this_t = MeteringWindow;

		    // Factor the coordinate is multiplied by.
		    float32_t2 meteringWindowScale;
		    // Bias added after the multiplication.
		    float32_t2 meteringWindowOffset;

		    // Builds a window from an explicit scale and offset.
		    static this_t create(float32_t2 scale, float32_t2 offset)
		    {
		        this_t window;
		        window.meteringWindowOffset = offset;
		        window.meteringWindowScale = scale;
		        return window;
		    }
		};

		}
		}
		}

		#endif
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,287 @@
		// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
		// This file is part of the "Nabla Engine".
		// For conditions of distribution and use, see copyright notice in nabla.h

		#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
		#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_

		#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
		#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
		#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
		#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
		#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
		#include "nbl/builtin/hlsl/type_traits.hlsl"
		#include "nbl/builtin/hlsl/math/morton.hlsl"
		#include "nbl/builtin/hlsl/luma_meter/common.hlsl"

		namespace nbl
		{
		namespace hlsl
		{
		namespace luma_meter
		{

		template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
		struct geom_meter {
		using float_t = typename SharedAccessor::type;
		using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Even if the color computation is done in `float16_t`, this doesn't free you from doing the texture coordinate calculation in `float32_t`.
		using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
		using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;

		// Builds a meter from the luma clamp range and the sample count stored for later normalization.
		static this_t create(float_t2 lumaMinMax, float_t sampleCount)
		{
		    this_t meter;
		    meter.sampleCount = sampleCount;
		    meter.lumaMinMax = lumaMinMax;
		    return meter;
		}

		// Workgroup-wide sum of `value`; `sdata` is handed to workgroup::reduction.
		float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
		{
		    const float_t workgroupSum = workgroup::reduction<plus<float_t>, GroupSize>::template __call<SharedAccessor>(value, sdata);
		    return workgroupSum;
		}

		// Samples the colour at `shiftedCoord` mapped through the metering window, converts it with
		// TexAccessor::toXYZ, clamps to [lumaMinMax.x, lumaMinMax.y] and returns log2 of the result.
		float_t __computeLumaLog2(
		    NBL_CONST_REF_ARG(MeteringWindow) window,
		    NBL_REF_ARG(TexAccessor) tex,
		    float_t2 shiftedCoord
		)
		{
		    float_t2 windowUv = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
		    float_t3 texel = tex.get(windowUv);
		    float_t clampedLuma = clamp((float_t)TexAccessor::toXYZ(texel), lumaMinMax.x, lumaMinMax.y);
		    return log2(clampedLuma);
		}

		void __uploadFloat(
		NBL_REF_ARG(ValueAccessor) val_accessor,
		float_t val,
		float_t minLog2,
		float_t rangeLog2
Comment on lines +63 to +64 Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. These should already be precomputed as members.
		)
		{
		uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Take the workgroup count and the workgroup XYZ coordinate (or workgroup index) from outside, as function arguments; otherwise, in the presence of solutions such as virtual workgroups or persistent threads, this whole thing will fall apart.
		uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
		uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

		uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
Comment on lines +69 to +71 Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Let's write some docs for this. The `val` was produced by a workgroup reduction performed on values in the `[MinLog2,MaxLog2]` range, which makes the `scaledLogLuma` (the variable that should hold `(val-minLog2)*rangeLog2`) lie between 0 and WorkGroupSize. This value is atomically added by N workgroups. You now want to represent it in fixed point during the atomic add without being vulnerable to overflow; the worst case is adding N times WorkGroupSize. This means that we need to multiply the value by `(2^32-1)/N`, precomputed as a float, or, if you must, round `N` up to a power of two and see how many bits are left (512 workgroups means 9 bits, so 23 are left). To avoid rounding precision errors, the power-of-two method is chosen. I have no clue where you're getting `+SubgroupSizeLog2` from. Also, the value of `(1<<fixedPointBitsLeft)-1` must be precomputed in `create` and stored as a member. It should be as easy as `const uint32_t scaledLumaLog2BitPattern = uint32_t((val-lumaMinLog2)*maxIncrement_over_lumaRangeLog2+float_t(0.5));` where `maxIncrement = (0x1u<<(32u-uint32_t(ceil(log2(WorkGroupCount*WorkGroupSize)))))-1;`

		val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
		}

		float_t __downloadFloat(
		NBL_REF_ARG(ValueAccessor) val_accessor,
		uint32_t index,
		float_t minLog2,
		float_t rangeLog2
		)
		{
		float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
		return luma / rangeLog2 + minLog2;
Comment on lines +83 to +84 Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025• edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Again, you're getting random floats based on the workgroup index, which thankfully was always the same (a rare case of two wrongs making a right). Again, if you wanted to stagger, you should use the entire subgroup to load the values, then subgroup-reduce them. Just converting to `float_t` is not the correct way to decode; you should divide by the `maxIncrement`.
		}

		void sampleLuma(
		NBL_CONST_REF_ARG(MeteringWindow) window,
		NBL_REF_ARG(ValueAccessor) val,
		NBL_REF_ARG(TexAccessor) tex,
		NBL_REF_ARG(SharedAccessor) sdata,
		float_t2 tileOffset,
Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Why is the tile offset being provided from the outside? It's a byproduct of your workgroupID, and of workgroupSize-1 decoded as Morton, +1 in each dimension.
		float_t2 viewportSize
		)
		{
		uint32_t tid = workgroup::SubgroupContiguousIndex();
		uint32_t2 coord = {
		morton2d_decode_x(tid),
		morton2d_decode_y(tid)
		};

		float_t luma = 0.0f;
		float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
		float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
		float_t lumaLog2Sum = __reduction(lumaLog2, sdata);

		if (tid == 0) {
		__uploadFloat(
		val,
		lumaLog2Sum,
		log2(lumaMinMax.x),
		log2(lumaMinMax.y / lumaMinMax.x)
		);
		}
		}

		float_t gatherLuma(
		NBL_REF_ARG(ValueAccessor) val
		)
		{
		uint32_t tid = glsl::gl_SubgroupInvocationID();
		float_t luma = glsl::subgroupAdd(
		__downloadFloat(
		val,
		tid,
		log2(lumaMinMax.x),
		log2(lumaMinMax.y / lumaMinMax.x)
		)
		);

		uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
		uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
Comment on lines +131 to +132 Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. You're supposed to normalize by the number of samples you took during the sampling step; your `workGroupCount` here is NOT that value, it's the number of workgroups you're exposing with. Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. You must precompute the `fixedPointBitsLeft` in the `create` method (and it needs to know how many invocations you'll be running the sample step with).

		return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
		}

		float_t sampleCount;
Comment on lines +134 to +137 Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. You want to compute and store the reciprocal of `sampleCount` and of `1<<fixedPointBitsLeft`. Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. That was the purpose of the `rcpFirstPassWGCount` variable in the old GLSL.
		float_t2 lumaMinMax;
Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Don't do the weird things we used to do in GLSL (due to having no scalar layout); have separate variables for min and max. Copy link MemberAuthor devshgraphicsprogrammingMar 17, 2025 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others.Learn more. Also, you should have the min and max precomputed with `log2` already applied.
		};

		// Histogram-based luma meter.
		//  - `sampleLuma` bins one luma sample per invocation into a workgroup-local histogram held in
		//    `sdata`, runs an inclusive scan over the bins in blocks of GroupSize and atomically adds the
		//    scanned values to `histo`.
		//  - `gatherLuma` copies `histo` into `sdata` and averages the decoded entries found at 40% and
		//    60% of BinCount.
		//
		// Requirements visible from the code below:
		//  - SharedAccessor    : `type`, `set`, `get(index, out)`, `atomicAdd`,
		//                        `workgroupExecutionAndMemoryBarrier`
		//  - HistogramAccessor : `atomicAdd(index, value)` and `get(index)`
		//  - TexAccessor       : `get(uv)` returning a colour and a static `toXYZ(colour)`
		template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
		struct median_meter {
		    using int_t = typename SharedAccessor::type;
		    using float_t = float32_t;
		    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
		    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
		    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;

		    // Builds a meter that clamps and bins luma inside [lumaMinMax.x, lumaMinMax.y].
		    static this_t create(float_t2 lumaMinMax) {
		        this_t retval;
		        retval.lumaMinMax = lumaMinMax;
		        return retval;
		    }

		    // Workgroup-wide inclusive prefix sum over int_t; `sdata` is handed to workgroup::inclusive_scan.
		    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
		        return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
		            template __call <SharedAccessor>(value, sdata);
		    }

		    // Fixed-point bit budget derived from the current dispatch:
		    //   32 - ceil(log2(total workgroup count)) + log2(subgroup size)
		    // Clamped to 31 so `1u << bits` is always a valid 32-bit shift; unclamped the expression reaches
		    // 32 or more for dispatches with no more workgroups than a subgroup has invocations.
		    static uint32_t __fixedPointBitsLeft() {
		        const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
		        const uint32_t totalWorkGroups = workGroupCount.x * workGroupCount.y * workGroupCount.z;
		        const uint32_t countBits = uint32_t(ceil(log2(float32_t(totalWorkGroups))));
		        return min(32u - countBits + glsl::gl_SubgroupSizeLog2(), 31u);
		    }

		    // Fetches the colour at `shiftedCoord` mapped through the metering window, converts it with
		    // TexAccessor::toXYZ and clamps the result to [lumaMinMax.x, lumaMinMax.y].
		    float_t __computeLuma(
		        NBL_CONST_REF_ARG(MeteringWindow) window,
		        NBL_REF_ARG(TexAccessor) tex,
		        float_t2 shiftedCoord
		    ) {
		        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
		        float_t3 color = tex.get(uvPos);
		        float_t luma = (float_t)TexAccessor::toXYZ(color);

		        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
		    }

		    // Encodes `(val - minLog2) * rangeLog2` as an integer clamped to [0, 2^fixedPointBitsLeft - 1].
		    // NOTE(review): despite the parameter names, every caller in this struct passes the linear
		    // minimum and the linear range (lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) -- confirm intent.
		    int_t __float2Int(
		        float_t val,
		        float_t minLog2,
		        float_t rangeLog2
		    ) {
		        const float32_t maxPattern = float32_t((1u << __fixedPointBitsLeft()) - 1u);
		        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, maxPattern));
		    }

		    // Inverse of the affine part of `__float2Int`: `val / rangeLog2 + minLog2`.
		    float_t __int2Float(
		        int_t val,
		        float_t minLog2,
		        float_t rangeLog2
		    ) {
		        return val / rangeLog2 + minLog2;
		    }

		    // Sampling pass.
		    //  window       : metering window applied to the normalized coordinate
		    //  histo        : histogram accumulated through atomicAdd
		    //  tex          : texture accessor
		    //  sdata        : workgroup-local bins, also handed to the scan
		    //  tileOffset   : offset added to the per-invocation Morton-decoded coordinate
		    //  viewportSize : divisor that normalizes the coordinate
		    void sampleLuma(
		        NBL_CONST_REF_ARG(MeteringWindow) window,
		        NBL_REF_ARG(HistogramAccessor) histo,
		        NBL_REF_ARG(TexAccessor) tex,
		        NBL_REF_ARG(SharedAccessor) sdata,
		        float_t2 tileOffset,
		        float_t2 viewportSize
		    ) {
		        uint32_t tid = workgroup::SubgroupContiguousIndex();

		        // zero the workgroup-local bins
		        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
		            sdata.set(vid, 0);
		        }

		        sdata.workgroupExecutionAndMemoryBarrier();

		        uint32_t2 coord = {
		            morton2d_decode_x(tid),
		            morton2d_decode_y(tid)
		        };

		        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
		        const float_t luma = __computeLuma(window, tex, shiftedCoord);

		        const float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
		        // luma is clamped to an inclusive range, so luma == lumaMinMax.y would map to bin BinCount;
		        // fold it into the last valid bin.
		        const uint32_t binIndex = min((uint32_t)((luma - lumaMinMax.x) / binSize), uint32_t(BinCount) - 1u);

		        // NOTE(review): the bin receives the encoded luma value rather than a count of 1 -- confirm
		        // this is the intended histogram weight.
		        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));

		        sdata.workgroupExecutionAndMemoryBarrier();

		        // invocations past the last bin contribute zero instead of reading outside the bins
		        float_t histogram_value = 0.0f;
		        if (tid < BinCount) {
		            sdata.get(tid, histogram_value);
		        }

		        sdata.workgroupExecutionAndMemoryBarrier();

		        // the scan must be executed by every invocation, only the write-out is guarded
		        float_t sum = __inclusive_scan(histogram_value, sdata);
		        if (tid < BinCount) {
		            histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
		        }

		        const bool is_last_wg_invocation = tid == (GroupSize - 1);
		        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;

		        for (uint32_t i = 1; i < RoundedBinCount; i++) {
		            uint32_t keyBucketStart = GroupSize * i;
		            uint32_t vid = tid + keyBucketStart;

		            // no if statement about the last iteration needed
		            if (is_last_wg_invocation) {
		                float_t beforeSum;
		                sdata.get(keyBucketStart, beforeSum);
		                sdata.set(keyBucketStart, beforeSum + sum);
		            }

		            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
		            sdata.workgroupExecutionAndMemoryBarrier();

		            // no aliasing anymore
		            float_t atVid = 0.0f;
		            if (vid < BinCount) {
		                sdata.get(vid, atVid);
		            }
		            sum = __inclusive_scan(atVid, sdata);
		            if (vid < BinCount) {
		                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
		            }
		        }
		    }

		    // Gather pass: copies the histogram into `sdata`, then decodes and averages the entries located
		    // at 40% and 60% of BinCount.
		    // NOTE(review): this reads the values stored AT those two bins; it does not search for the bins
		    // where the cumulative total crosses 40% / 60% -- confirm this is the intended estimate.
		    float_t gatherLuma(
		        NBL_REF_ARG(HistogramAccessor) histo,
		        NBL_REF_ARG(SharedAccessor) sdata
		    ) {
		        uint32_t tid = workgroup::SubgroupContiguousIndex();

		        // vid is already < BinCount, so it is used directly; masking with (BinCount - 1) would alias
		        // bins whenever BinCount is not a power of two
		        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
		            sdata.set(vid, histo.get(vid));
		        }

		        sdata.workgroupExecutionAndMemoryBarrier();

		        const uint32_t bin40 = uint32_t(BinCount * 0.4f);
		        const uint32_t bin60 = uint32_t(BinCount * 0.6f);

		        uint32_t percentile40, percentile60;
		        sdata.get(bin40, percentile40);
		        sdata.get(bin60, percentile60);

		        const float_t lumaMin = lumaMinMax.x;
		        const float_t lumaRange = lumaMinMax.y - lumaMinMax.x;
		        return (__int2Float(percentile40, lumaMin, lumaRange) + __int2Float(percentile60, lumaMin, lumaRange)) / 2;
		    }

		    float_t2 lumaMinMax;
		};

		}
		}
		}

		#endif