Automatic performance benchmarking #93
Merged
Changes from all commits
Commits (17)
20ad6a4 fjwillemsen: Added microbenchmark for relative performance factor
2cf9f5e fjwillemsen: Added pytest-benchmark, updated dependencies
6d555cc fjwillemsen: Added MicroHH kernel and comparison to reference
545c60a fjwillemsen: Implemented support for saving benchmark data and passing OS name as …
0ea091f fjwillemsen: Made Numpy-free implementation of microbenchmarks to avoid adding Num…
263e22f fjwillemsen: Register benchmarks folder and path
337449e fjwillemsen: Updated reference microbenchmark times, improved comments
60c9751 fjwillemsen: Added the dedispersion benchmark and reference performance
dd1bb3a fjwillemsen: Added the Hotspot benchmark and reference performance
eaf9aa9 fjwillemsen: Benchmark outcomes are validated
dd9b71d fjwillemsen: Benchmark results collection and printing
1f25e9a fjwillemsen: Improved and standardized performance check and printing
6743764 fjwillemsen: Added GH benchmark actions
5b11a87 fjwillemsen: Fixed an error with the benchmark output file path
d909bff fjwillemsen: Created docs folder for benchmarks, point GH action to main instead o…
31640bb fjwillemsen: Added token to save benchmark results
9fd0036 fjwillemsen: Minor update to GH action build & test workflow
16 changes: 15 additions & 1 deletion .github/workflows/build-test-python-package.yml
1 change: 1 addition & 0 deletions .gitignore
@@ -34,6 +34,7 @@ pip-log.txt
 pip-delete-this-directory.txt

 # Unit test / coverage reports
+.benchmarks
 htmlcov/
 .tox/
 .coverage
Empty file added: docs/benchmarks/.gitkeep
12 changes: 11 additions & 1 deletion noxfile.py
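The noxfile.py changes themselves are collapsed above. As a rough illustration of how a nox session could invoke these benchmarks and store the results under the new docs/benchmarks folder, here is a minimal sketch; the session name, paths, and command-line flags are assumptions for illustration, not the actual diff:

# Hypothetical sketch only: a nox session that runs the benchmark tests with
# pytest-benchmark and writes machine-readable results into docs/benchmarks/.
# The session name and output path are assumptions, not the PR's real noxfile changes.
import nox


@nox.session
def benchmark(session):
    # install the package plus the benchmarking dependency added in this PR
    session.install(".", "pytest", "pytest-benchmark")
    # run only the benchmark tests and save the results as JSON
    session.run(
        "pytest",
        "tests/test_benchmark.py",
        "--benchmark-json=docs/benchmarks/benchmark_result.json",
    )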
155 changes: 96 additions & 59 deletions poetry.lock
Large diffs are not rendered by default.
1 change: 1 addition & 0 deletions pyproject.toml
247 changes: 247 additions & 0 deletions tests/test_benchmark.py
from random import random
from time import perf_counter
import pytest
from constraint import Problem
from math import sqrt

# reference times (using A4000 on DAS6)
reference_microbenchmark_mean = [0.3784186691045761, 0.4737640768289566, 0.10726054509480794, 0.10744890073935191, 0.10979799057046573, 0.15360217044750848, 0.14483965436617532, 0.054416230569283165, 0.13835338006416956, 0.1371802551050981]  # noqa E501
reference_results = {
    "microhh": 1.1565620,
    "dedispersion": 0.1171140,
    "hotspot": 2.6839208,
}

# device properties (for A4000 on DAS6 using get_opencl_device_info.cpp)
dev = {
    "max_threads": 1024,
    "max_threads_per_sm": 1024,
    "max_threads_per_block": 1536,
    "max_shared_memory_per_block": 49152,
    "max_shared_memory": 102400,
    "max_wi_size": [1024, 1024, 64],
    "max_wg_size": 1024,
}

# collect benchmark times
benchmark_results = dict()


@pytest.mark.skip
def get_performance_factor(repeats=3):
    """Run microbenchmarks to indicate how much slower this system is compared to the reference."""

    def cpu_1():
        """Matrix multiplication"""
        size = 100
        A = [[random() for _ in range(size)] for _ in range(size)]
        B = [[random() for _ in range(size)] for _ in range(size)]
        result = [[sum(A[i][k] * B[k][j] for k in range(size)) for j in range(size)] for i in range(size)]
        return result

    def cpu_2():
        """Element-wise arithmetic"""
        N = 10**6
        A = [random() for _ in range(N)]
        B = [random() for _ in range(N)]
        return [A[i] + B[i] for i in range(N)]

    def cpu_3():
        """Addition"""
        N = 10**6
        return [i + i for i in range(N)]

    def cpu_4():
        """Multiplication"""
        N = 10**6
        return [i * i for i in range(N)]

    def cpu_5():
        """Division"""
        N = 10**6
        return [i / i for i in range(1, N + 1)]

    def mem_1():
        """Array copying"""
        N = 10**6
        A = [random() for _ in range(N)]
        return A.copy()

    def mem_2():
        """Array slicing"""
        N = 10**6
        A = [random() for _ in range(N)]
        return A[::2]

    def mem_3():
        """Dictionary lookup"""
        N = 10**3
        keys = list(range(N))
        values = list(range(N))
        lst = list(zip(keys, values))
        return [next((v for k, v in lst if k == i), None) for i in range(N)]

    def cache_1():
        """Sequential array sum"""
        N = 10**6
        A = [random() for _ in range(N)]
        return sum(A)

    def cache_2():
        """Strided array sum"""
        N = 10**6
        A = [random() for _ in range(N)]
        return sum(A[::2])

    # run the benchmarks
    benchmarks = [cpu_1, cpu_2, cpu_3, cpu_4, cpu_5, mem_1, mem_2, mem_3, cache_1, cache_2]
    raw_data = [list() for _ in range(repeats)]
    for i in range(repeats):
        for f in benchmarks:
            start = perf_counter()
            f()
            duration = perf_counter() - start
            raw_data[i].append(duration)

    # non-Numpy implementation of statistics calculation
    transposed_data = list(zip(*raw_data))  # transpose the raw_data to get columns as rows

    # calculate mean along axis=0 (column-wise) (`benchmark_data.mean(axis=0)`)
    benchmark_mean = [sum(column) / len(column) for column in transposed_data]

    # calculate standard deviation along axis=0 (column-wise)
    def stddev(column, mean):
        variance = sum((x - mean) ** 2 for x in column) / len(column)
        return sqrt(variance)

    # calculate relative standard deviation (`(benchmark_data.std(axis=0) / abs(np_benchmark_mean))`)
    benchmark_std = [stddev(column, mean) for column, mean in zip(transposed_data, benchmark_mean)]
    relative_std = [(s / abs(m)) if m != 0 else 0 for s, m in zip(benchmark_std, benchmark_mean)]

    # calculate mean relative standard deviation and apply threshold (`max(np.mean(np_relative_std), 0.125)`)
    mean_relative_std = max(sum(relative_std) / len(relative_std), 0.125)

    # calculate performance factor (`np.mean(np_benchmark_mean / reference_microbenchmark_mean)`)
    performance_factor = sum(bm / rm for bm, rm in zip(benchmark_mean, reference_microbenchmark_mean)) / len(benchmark_mean)
    return performance_factor, mean_relative_std


performance_factor, mean_relative_std = get_performance_factor()
print(f"\nSystem performance factor: {round(performance_factor, 3)}")


@pytest.mark.skip
def check_benchmark_performance(benchmark_name, mean, std):
    """Utility function to check whether the performance of a benchmark is within the expected range and print information."""
    reference_result = reference_results[benchmark_name]
    assert mean - std * 2 <= reference_result * (performance_factor + mean_relative_std * 2)
    print(f"Reference: {round(reference_result, 3)}, benchmark: {round(mean, 3)}, expected: {round(reference_result * performance_factor, 3)}")


def test_microhh(benchmark):
    """Based on the MicroHH search space in the paper."""
    benchmark_name = "microhh"
    cta_padding = 0  # default argument

    # setup the tunable parameters
    problem = Problem()
    problem.addVariable("STATIC_STRIDES", [0])
    problem.addVariable("TILING_STRATEGY", [0])
    problem.addVariable("REWRITE_INTERP", [0])
    problem.addVariable("BLOCK_SIZE_X", [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024])
    problem.addVariable("BLOCK_SIZE_Y", [1, 2, 4, 8, 16, 32])
    problem.addVariable("BLOCK_SIZE_Z", [1, 2, 4])
    problem.addVariable("TILING_FACTOR_X", [1, 2, 4, 8])
    problem.addVariable("TILING_FACTOR_Y", [1, 2, 4])
    problem.addVariable("TILING_FACTOR_Z", [1, 2, 4])
    problem.addVariable("LOOP_UNROLL_FACTOR_X", [1, 2, 4, 8])
    problem.addVariable("LOOP_UNROLL_FACTOR_Y", [1, 2, 4])
    problem.addVariable("LOOP_UNROLL_FACTOR_Z", [1, 2, 4])
    problem.addVariable("BLOCKS_PER_MP", [0, 1, 2, 3, 4])

    # setup the restrictions
    problem.addConstraint([
        f"BLOCK_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_Z * BLOCKS_PER_MP <= {dev['max_threads_per_sm']}",
        f"32 <= BLOCK_SIZE_X * BLOCK_SIZE_Y * BLOCK_SIZE_Z <= {dev['max_threads_per_block']}",
        "LOOP_UNROLL_FACTOR_X == 0 or TILING_FACTOR_X % LOOP_UNROLL_FACTOR_X == 0",
        "LOOP_UNROLL_FACTOR_Y == 0 or TILING_FACTOR_Y % LOOP_UNROLL_FACTOR_Y == 0",
        "LOOP_UNROLL_FACTOR_Z == 0 or TILING_FACTOR_Z % LOOP_UNROLL_FACTOR_Z == 0",
        f"BLOCK_SIZE_X * TILING_FACTOR_X > {cta_padding}",
        f"BLOCK_SIZE_Y * TILING_FACTOR_Y > {cta_padding}",
        f"BLOCK_SIZE_Z * TILING_FACTOR_Z > {cta_padding}",
    ])

    # run the benchmark and check for valid outcome and performance degradation
    solutions = benchmark(problem.getSolutions)
    reference_result = reference_results[benchmark_name]
    benchmark_result = benchmark.stats.stats.mean
    benchmark_results[benchmark_name] = benchmark_result
    assert len(solutions) == 138600
    check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)


def test_dedispersion(benchmark):
    """Based on the Dedispersion search space in the paper."""
    benchmark_name = "dedispersion"

    # setup the tunable parameters
    problem = Problem()
    problem.addVariable("block_size_x", [1, 2, 4, 8] + [16 * i for i in range(1, 3)])
    problem.addVariable("block_size_y", [8 * i for i in range(4, 33)])
    problem.addVariable("block_size_z", [1])
    problem.addVariable("tile_size_x", [i for i in range(1, 5)])
    problem.addVariable("tile_size_y", [i for i in range(1, 9)])
    problem.addVariable("tile_stride_x", [0, 1])
    problem.addVariable("tile_stride_y", [0, 1])
    problem.addVariable("loop_unroll_factor_channel", [0])

    # setup the restrictions
    check_block_size = "32 <= block_size_x * block_size_y <= 1024"
    check_tile_stride_x = "tile_size_x > 1 or tile_stride_x == 0"
    check_tile_stride_y = "tile_size_y > 1 or tile_stride_y == 0"
    problem.addConstraint([check_block_size, check_tile_stride_x, check_tile_stride_y])

    # run the benchmark and check for valid outcome and performance degradation
    solutions = benchmark(problem.getSolutions)
    reference_result = reference_results[benchmark_name]
    benchmark_result = benchmark.stats.stats.mean
    benchmark_results[benchmark_name] = benchmark_result
    assert len(solutions) == 11130
    check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)


def test_hotspot(benchmark):
    """Based on the Hotspot search space in the paper."""
    benchmark_name = "hotspot"

    # constants
    temporal_tiling_factor = [i for i in range(1, 11)]
    max_tfactor = max(temporal_tiling_factor)

    # setup the tunable parameters
    problem = Problem()
    problem.addVariable("block_size_x", [1, 2, 4, 8, 16] + [32 * i for i in range(1, 33)])
    problem.addVariable("block_size_y", [2**i for i in range(6)])
    problem.addVariable("tile_size_x", [i for i in range(1, 11)])
    problem.addVariable("tile_size_y", [i for i in range(1, 11)])
    problem.addVariable("temporal_tiling_factor", temporal_tiling_factor)
    problem.addVariable("max_tfactor", [max_tfactor])
    problem.addVariable("loop_unroll_factor_t", [i for i in range(1, max_tfactor + 1)])
    problem.addVariable("sh_power", [0, 1])
    problem.addVariable("blocks_per_sm", [0, 1, 2, 3, 4])

    # setup the restrictions
    problem.addConstraint([
        "block_size_x*block_size_y >= 32",
        "temporal_tiling_factor % loop_unroll_factor_t == 0",
        f"block_size_x*block_size_y <= {dev['max_threads']}",
        f"(block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4 <= {dev['max_shared_memory_per_block']}",
        f"blocks_per_sm == 0 or (((block_size_x*tile_size_x + temporal_tiling_factor * 2) * (block_size_y*tile_size_y + temporal_tiling_factor * 2) * (2+sh_power) * 4) * blocks_per_sm <= {dev['max_shared_memory']})",
    ])

    # run the benchmark and check for valid outcome and performance degradation
    solutions = benchmark(problem.getSolutions)
    reference_result = reference_results[benchmark_name]
    benchmark_result = benchmark.stats.stats.mean
    benchmark_results[benchmark_name] = benchmark_result
    assert len(solutions) == 349853
    check_benchmark_performance(benchmark_name, benchmark_result, benchmark.stats.stats.stddev)
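For intuition on the tolerance used by check_benchmark_performance above: a benchmark passes if its mean minus two standard deviations does not exceed the reference time scaled by the system's performance factor plus twice the mean relative standard deviation. A minimal worked example follows; only the microhh reference time is taken from the file above, the other values are hypothetical:

# Illustrative numbers only: how the performance assertion above behaves.
reference_result = 1.1565620   # reference mean for "microhh" (A4000 on DAS6), from reference_results
performance_factor = 1.5       # hypothetical: this system's microbenchmarks ran ~1.5x slower
mean_relative_std = 0.125      # the lower bound that get_performance_factor applies to the noise estimate
mean, std = 1.9, 0.05          # hypothetical pytest-benchmark outcome on this system

allowed = reference_result * (performance_factor + mean_relative_std * 2)  # ~2.02 seconds
assert mean - std * 2 <= allowed  # 1.8 <= ~2.02, so this run would pass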