Apr 14, 2021 · Apr 14, 2021 · Apr 14, 2021 · Apr 21, 2021 · Apr 23, 2021 · Apr 23, 2021
diff --git a/dpnp/backend/kernels/dpnp_krnl_random.cpp b/dpnp/backend/kernels/dpnp_krnl_random.cpp
    _DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one;
    _DataType* Uvec = nullptr;
    _DataType* Vvec = nullptr;
    bool* result_ready = nullptr;
    bool* result_mask = nullptr;
    const _DataType d_zero = 0.0, d_one = 1.0;

    assert(kappa > 1.0);

    Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
    Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));

    result_ready = reinterpret_cast<bool*>(dpnp_memory_alloc_c(1 * sizeof(bool)));
    result_ready[0] = false;
    result_mask = reinterpret_cast<bool*>(dpnp_memory_alloc_c(size * sizeof(bool)));
    dpnp_full_c<bool>(result_ready, result_mask, size);

 for (size_t n = 0; n < size;)
 while(!result_ready[0])
    {
        size_t diff_size = size - n;
        mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, 0.5 * M_PI);
        auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
        event_out.wait();
        // TODO
        // use deps case
        auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
        mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
        event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
        event_out.wait();
        auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);

        // TODO
        // kernel
        for (size_t i = 0; i < diff_size; i++)
        {
            _DataType sn, cn, sn2, cn2;
            _DataType neg_W_minus_one, V, Y;
        cl::sycl::range<1> gws(size);
        auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
            cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
            cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
                size_t i = global_id[0];
                if (!result_mask[i]) {
                    _DataType sn, cn, sn2, cn2;
                    _DataType neg_W_minus_one, V, Y;

            sn = sin(Uvec[i]);
            cn = cos(Uvec[i]);
            V = Vvec[i];
            sn2 = sn * sn;
            cn2 = cn * cn;
 sn =cl::sycl::sin(Uvec[i]);
 cn =cl::sycl::cos(Uvec[i]);
 V = Vvec[i];
 sn2 = sn * sn;
 cn2 = cn * cn;

            neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
            Y = kappa * (s_minus_one + neg_W_minus_one);
 neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
 Y = kappa * (s_minus_one + neg_W_minus_one);

            if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
            {
                Y = neg_W_minus_one * (2 - neg_W_minus_one);
                if (Y < 0)
                    Y = 0.0;
                else if (Y > 1.0)
                    Y = 1.0;
                    if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
                    {
                        Y = neg_W_minus_one * (2 - neg_W_minus_one);
                        if (Y < 0)
                            Y = 0.0;
                        else if (Y > 1.0)
                            Y = 1.0;

                        result1[i] = cl::sycl::asin(cl::sycl::sqrt(Y));
                        result_mask[i] = true;
                    }
                }
            });
        };
        auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
        some_event.wait();

                result1[n++] = asin(sqrt(Y));
            }
        }
        dpnp_all_c<bool, bool>(result_mask, result_ready, size);
    }

    dpnp_memory_free_c(Uvec);
    dpnp_memory_free_c(result_ready);
    dpnp_memory_free_c(result_mask);

    mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
    auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
    _DataType rho_over_kappa, rho, r, s_kappa;
    _DataType* Uvec = nullptr;
    _DataType* Vvec = nullptr;
    bool* result_ready = nullptr;
    bool* result_mask = nullptr;

    const _DataType d_zero = 0.0, d_one = 1.0;

    Uvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));
    Vvec = reinterpret_cast<_DataType*>(dpnp_memory_alloc_c(size * sizeof(_DataType)));

    for (size_t n = 0; n < size;)
    result_ready = reinterpret_cast<bool*>(dpnp_memory_alloc_c(1 * sizeof(bool)));
    result_ready[0] = false;
    result_mask = reinterpret_cast<bool*>(dpnp_memory_alloc_c(size * sizeof(bool)));
    dpnp_full_c<bool>(result_ready, result_mask, size);

    while (!result_ready[0])
    {
        size_t diff_size = size - n;
        mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, M_PI);
        auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
        event_out.wait();
        // TODO
        // use deps case
        auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
        mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
        event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
        event_out.wait();
        auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);

        // TODO
        // kernel
        for (size_t i = 0; i < diff_size; i++)
        {
            _DataType Z, W, Y, V;
            Z = cos(Uvec[i]);
            V = Vvec[i];
            W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
            Y = s_kappa - kappa * W;
            if ((Y * (2 - Y) >= V) || (log(Y / V) + 1 >= Y))
            {
                result1[n++] = acos(W);
            }
        }
    }
        cl::sycl::range<1> gws((size));

        auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
            cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
            cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
                size_t i = global_id[0];
                if (!result_mask[i]) {
                    _DataType Z, W, Y, V;
                    Z = cl::sycl::cos(Uvec[i]);
                    V = Vvec[i];
                    W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
                    Y = s_kappa - kappa * W;
                    if ((Y * (2 - Y) >= V) || (cl::sycl::log(Y / V) + 1 >= Y))
                    {
                        result1[i] = cl::sycl::acos(W);
                        result_mask[i] = true;
                    }
                }
            });
        };
        auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
        some_event.wait();

        dpnp_all_c<bool, bool>(result_mask, result_ready, size);
    }
    dpnp_memory_free_c(Uvec);
    dpnp_memory_free_c(result_ready);
    dpnp_memory_free_c(result_mask);

    mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
    auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
diff --git a/tests/test_random.py b/tests/test_random.py
    @pytest.mark.parametrize("kappa", [5.0, 0.5], ids=['large_kappa', 'small_kappa'])
    def test_seed(self, kappa):
        seed = 28041990
        size =10
        size =1000
        mu = 2.
        dpnp.random.seed(seed)
        a1 = dpnp.asarray(dpnp.random.vonmises(mu, kappa, size=size))
Original file line number	Diff line number	Diff line change
Expand Up		@@ -1261,6 +1261,8 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da
		_DataType s_minus_one, hpt, r_over_two_kappa_minus_one, rho_minus_one;
		_DataType* Uvec = nullptr;
		_DataType* Vvec = nullptr;
		bool* result_ready = nullptr;
		bool* result_mask = nullptr;
		const _DataType d_zero = 0.0, d_one = 1.0;

		assert(kappa > 1.0);
Expand All		@@ -1276,49 +1278,59 @@ void dpnp_rng_vonmises_large_kappa_c(void* result, const _DataType mu, const _Da

		Uvec = reinterpret_cast<_DataType>(dpnp_memory_alloc_c(size sizeof(_DataType)));
		Vvec = reinterpret_cast<_DataType>(dpnp_memory_alloc_c(size sizeof(_DataType)));

		result_ready = reinterpret_cast<bool>(dpnp_memory_alloc_c(1 sizeof(bool)));
		result_ready[0] = false;
		result_mask = reinterpret_cast<bool>(dpnp_memory_alloc_c(size sizeof(bool)));
		dpnp_full_c<bool>(result_ready, result_mask, size);

		for (size_t n = 0; n < size;)
		while(!result_ready[0])
		{
		size_t diff_size = size - n;
		mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, 0.5 * M_PI);
		auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
		event_out.wait();
		// TODO
		// use deps case
		auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
		mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
		event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
		event_out.wait();
		auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);

		// TODO
		// kernel
		for (size_t i = 0; i < diff_size; i++)
		{
		_DataType sn, cn, sn2, cn2;
		_DataType neg_W_minus_one, V, Y;
		cl::sycl::range<1> gws(size);
		auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
Copy link Contributor shssf May 13, 2021 • edited Loading Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. The kernel is inside the loop with the bigger trip count. It would be more efficient to parallelize the algorithm (i.e. make the kernel) over the bigger value `size` instead of `size-n`. That would require a loop inside the kernel. It is questionable which will be more performant: (1) a loop with a queue of data-dependent kernels, or (2) a single kernel with a loop inside. It is hard to predict without performance measurements, but I would vote that parallelization with the bigger number of threads should be better.
		cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
		cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
		size_t i = global_id[0];
		if (!result_mask[i]) {
		_DataType sn, cn, sn2, cn2;
		_DataType neg_W_minus_one, V, Y;

		sn = sin(Uvec[i]);
		cn = cos(Uvec[i]);
		V = Vvec[i];
		sn2 = sn * sn;
		cn2 = cn * cn;
		sn =cl::sycl::sin(Uvec[i]);
		cn =cl::sycl::cos(Uvec[i]);
		V = Vvec[i];
		sn2 = sn * sn;
		cn2 = cn * cn;

		neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
		Y = kappa * (s_minus_one + neg_W_minus_one);
		neg_W_minus_one = s_minus_one * sn2 / (0.5 * s_minus_one + cn2);
		Y = kappa * (s_minus_one + neg_W_minus_one);

		if ((Y * (2 - Y) >= V) \|\| (log(Y / V) + 1 >= Y))
		{
		Y = neg_W_minus_one * (2 - neg_W_minus_one);
		if (Y < 0)
		Y = 0.0;
		else if (Y > 1.0)
		Y = 1.0;
		if ((Y * (2 - Y) >= V) \|\| (cl::sycl::log(Y / V) + 1 >= Y))
		{
		Y = neg_W_minus_one * (2 - neg_W_minus_one);
		if (Y < 0)
		Y = 0.0;
		else if (Y > 1.0)
		Y = 1.0;

		result1[i] = cl::sycl::asin(cl::sycl::sqrt(Y));
		result_mask[i] = true;
		}
		}
		});
		};
		auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
		some_event.wait();

		result1[n++] = asin(sqrt(Y));
		}
		}
		dpnp_all_c<bool, bool>(result_mask, result_ready, size);
		}

		dpnp_memory_free_c(Uvec);
		dpnp_memory_free_c(result_ready);
		dpnp_memory_free_c(result_mask);

		mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
		auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
Expand DownExpand Up		@@ -1357,6 +1369,8 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
		_DataType rho_over_kappa, rho, r, s_kappa;
		_DataType* Uvec = nullptr;
		_DataType* Vvec = nullptr;
		bool* result_ready = nullptr;
		bool* result_mask = nullptr;

		const _DataType d_zero = 0.0, d_one = 1.0;

Expand All		@@ -1372,35 +1386,46 @@ void dpnp_rng_vonmises_small_kappa_c(void* result, const _DataType mu, const _Da
		Uvec = reinterpret_cast<_DataType>(dpnp_memory_alloc_c(size sizeof(_DataType)));
		Vvec = reinterpret_cast<_DataType>(dpnp_memory_alloc_c(size sizeof(_DataType)));

		for (size_t n = 0; n < size;)
		result_ready = reinterpret_cast<bool>(dpnp_memory_alloc_c(1 sizeof(bool)));
		result_ready[0] = false;
		result_mask = reinterpret_cast<bool>(dpnp_memory_alloc_c(size sizeof(bool)));
		dpnp_full_c<bool>(result_ready, result_mask, size);

		while (!result_ready[0])
		{
		size_t diff_size = size - n;
		mkl_rng::uniform<_DataType> uniform_distribution_u(d_zero, M_PI);
		auto event_out = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, diff_size, Uvec);
		event_out.wait();
		// TODO
		// use deps case
		auto uniform_distr_u_event = mkl_rng::generate(uniform_distribution_u, DPNP_RNG_ENGINE, size, Uvec);
		mkl_rng::uniform<_DataType> uniform_distribution_v(d_zero, d_one);
		event_out = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, diff_size, Vvec);
		event_out.wait();
		auto uniform_distr_v_event = mkl_rng::generate(uniform_distribution_v, DPNP_RNG_ENGINE, size, Vvec);

		// TODO
		// kernel
		for (size_t i = 0; i < diff_size; i++)
		{
		_DataType Z, W, Y, V;
		Z = cos(Uvec[i]);
		V = Vvec[i];
		W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
		Y = s_kappa - kappa * W;
		if ((Y * (2 - Y) >= V) \|\| (log(Y / V) + 1 >= Y))
		{
		result1[n++] = acos(W);
		}
		}
		}
		cl::sycl::range<1> gws((size));

		auto paral_kernel_some = [&](cl::sycl::handler& cgh) {
		cgh.depends_on({uniform_distr_u_event, uniform_distr_v_event});
		cgh.parallel_for(gws, [=](cl::sycl::id<1> global_id) {
		size_t i = global_id[0];
		if (!result_mask[i]) {
		_DataType Z, W, Y, V;
		Z = cl::sycl::cos(Uvec[i]);
		V = Vvec[i];
		W = (kappa + s_kappa * Z) / (s_kappa + kappa * Z);
		Y = s_kappa - kappa * W;
		if ((Y * (2 - Y) >= V) \|\| (cl::sycl::log(Y / V) + 1 >= Y))
		{
		result1[i] = cl::sycl::acos(W);
		result_mask[i] = true;
		}
		}
		});
		};
		auto some_event = DPNP_QUEUE.submit(paral_kernel_some);
		some_event.wait();

		dpnp_all_c<bool, bool>(result_mask, result_ready, size);
		}
		dpnp_memory_free_c(Uvec);
		dpnp_memory_free_c(result_ready);
		dpnp_memory_free_c(result_mask);

		mkl_rng::uniform<_DataType> uniform_distribution(d_zero, d_one);
		auto uniform_distr_event = mkl_rng::generate(uniform_distribution, DPNP_RNG_ENGINE, size, Vvec);
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -875,7 +875,7 @@ def test_invalid_args(self):
		@pytest.mark.parametrize("kappa", [5.0, 0.5], ids=['large_kappa', 'small_kappa'])
		def test_seed(self, kappa):
		seed = 28041990
		size =10
		size =1000
		mu = 2.
		dpnp.random.seed(seed)
		a1 = dpnp.asarray(dpnp.random.vonmises(mu, kappa, size=size))
Expand Down