diff --git a/stan/math/opencl/copy.hpp b/stan/math/opencl/copy.hpp index 51838ffa171..e3599e5c933 100644 --- a/stan/math/opencl/copy.hpp +++ b/stan/math/opencl/copy.hpp @@ -97,10 +97,12 @@ inline auto from_matrix_cl(const T& src) { } else { try { cl::Event copy_event; - const cl::CommandQueue queue = opencl_context.queue(); + const cl::CommandQueue& queue = opencl_context.queue(); + std::vector copy_write_events(src.write_events().begin(), + src.write_events().end()); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, sizeof(T_val) * dst.size(), dst.data(), - &src.write_events(), &copy_event); + &copy_write_events, &copy_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { @@ -150,9 +152,11 @@ inline T_dst from_matrix_cl(const matrix_cl& src) { "dst.cols()", 1); try { cl::Event copy_event; - const cl::CommandQueue queue = opencl_context.queue(); + const cl::CommandQueue& queue = opencl_context.queue(); + std::vector copy_write_events(src.write_events().begin(), + src.write_events().end()); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, - sizeof(T), &dst, &src.write_events(), &copy_event); + sizeof(T), &dst, &copy_write_events, &copy_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { @@ -182,10 +186,12 @@ inline T_dst from_matrix_cl(const matrix_cl& src) { } try { cl::Event copy_event; - const cl::CommandQueue queue = opencl_context.queue(); + const cl::CommandQueue& queue = opencl_context.queue(); + std::vector copy_write_events(src.write_events().begin(), + src.write_events().end()); queue.enqueueReadBuffer(src.buffer(), opencl_context.in_order(), 0, sizeof(T) * src.rows(), dst.data(), - &src.write_events(), &copy_event); + &copy_write_events, &copy_event); copy_event.wait(); src.clear_write_events(); } catch (const cl::Error& e) { @@ -251,13 +257,14 @@ inline auto packed_copy(const T& src) { return dst; } try { - const cl::CommandQueue queue = opencl_context.queue(); + const cl::CommandQueue& queue 
= opencl_context.queue(); matrix_cl packed(packed_size, 1); stan::math::opencl_kernels::pack(cl::NDRange(src.rows(), src.rows()), packed, src, src.rows(), src.rows(), src.view()); const std::vector mat_events - = vec_concat(packed.read_write_events(), src.write_events()); + = vec_concat(std::vector{}, packed.read_write_events(), + src.write_events()); cl::Event copy_event; queue.enqueueReadBuffer(packed.buffer(), opencl_context.in_order(), 0, sizeof(T_val) * packed_size, dst.data(), @@ -303,7 +310,7 @@ inline matrix_cl packed_copy(Vec&& src, int rows) { try { matrix_cl packed(packed_size, 1); cl::Event packed_event; - const cl::CommandQueue queue = opencl_context.queue(); + const cl::CommandQueue& queue = opencl_context.queue(); queue.enqueueWriteBuffer( packed.buffer(), opencl_context.in_order() || std::is_rvalue_reference::value, 0, diff --git a/stan/math/opencl/kernel_cl.hpp b/stan/math/opencl/kernel_cl.hpp index de2706cccd6..8f0b6a66e7d 100644 --- a/stan/math/opencl/kernel_cl.hpp +++ b/stan/math/opencl/kernel_cl.hpp @@ -109,17 +109,17 @@ inline void assign_events(const cl::Event& new_event, CallArg& m, * @return A vector of OpenCL events. */ template * = nullptr> -inline std::vector select_events(const T& m) { - return {}; +inline tbb::concurrent_vector select_events(const T& m) { + return tbb::concurrent_vector{}; } template * = nullptr, require_same_t* = nullptr> -inline const std::vector& select_events(const K& m) { +inline const tbb::concurrent_vector& select_events(const K& m) { return m.write_events(); } template * = nullptr, require_any_same_t* = nullptr> -inline std::vector select_events(K& m) { +inline tbb::concurrent_vector select_events(K& m) { static_assert(!std::is_const::value, "Can not write to const matrix_cl!"); return m.read_write_events(); } @@ -133,9 +133,9 @@ inline std::vector select_events(K& m) { * @param sources A std::vector of strings containing the code for the kernel. 
* @param options The values of macros to be passed at compile time. */ -inline auto compile_kernel(const char* name, - const std::vector& sources, - const std::map& options) { +inline auto compile_kernel( + const char* name, const std::vector& sources, + const std::unordered_map& options) { auto base_opts = opencl_context.base_opts(); for (auto& it : options) { if (base_opts[it.first] > it.second) { @@ -175,7 +175,7 @@ struct kernel_cl { private: const char* name_; std::vector sources_; - std::map opts_; + std::unordered_map opts_; mutable cl::Kernel kernel_; public: @@ -187,7 +187,7 @@ struct kernel_cl { * @param options The values of macros to be passed at compile time. */ kernel_cl(const char* name, std::vector sources, - std::map options = {}) + std::unordered_map options = {}) : name_(name), sources_(std::move(sources)), opts_(std::move(options)) {} /** \ingroup kernel_executor_opencl @@ -205,7 +205,8 @@ struct kernel_cl { opencl_context.register_kernel_cache(&kernel_); } cl::EnqueueArgs eargs(opencl_context.queue(), - vec_concat(internal::select_events(args)...), + vec_concat(std::vector{}, + internal::select_events(args)...), global_thread_size); cl::KernelFunctor&...> kernel_functor( kernel_); @@ -232,7 +233,8 @@ struct kernel_cl { opencl_context.register_kernel_cache(&kernel_); } cl::EnqueueArgs eargs(opencl_context.queue(), - vec_concat(internal::select_events(args)...), + vec_concat(std::vector{}, + internal::select_events(args)...), global_thread_size, thread_block_size); cl::KernelFunctor&...> kernel_functor( kernel_); diff --git a/stan/math/opencl/kernel_generator/append.hpp b/stan/math/opencl/kernel_generator/append.hpp index 9f982cccb57..c29cbe40284 100644 --- a/stan/math/opencl/kernel_generator/append.hpp +++ b/stan/math/opencl/kernel_generator/append.hpp @@ -88,8 +88,8 @@ class append_row_ : public operation_cl, * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - 
std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -101,7 +101,7 @@ class append_row_ : public operation_cl, true); std::string row_index_name_b = "(" + row_index_name + " - " + var_name_ + "_first_rows)"; - std::map generated_b; + std::unordered_map generated_b; kernel_parts parts_b = this->template get_arg<1>().get_kernel_parts( generated_b, generated_all, name_gen, row_index_name_b, col_index_name, true); @@ -129,14 +129,15 @@ class append_row_ : public operation_cl, * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; this->template get_arg<0>().set_args(generated, generated_all, kernel, arg_num); - std::map generated_b; + std::unordered_map generated_b; this->template get_arg<1>().set_args(generated_b, generated_all, kernel, arg_num); kernel.setArg(arg_num++, this->template get_arg<0>().rows()); @@ -250,8 +251,8 @@ class append_col_ : public operation_cl, * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -263,7 +264,7 @@ class append_col_ : public operation_cl, true); std::string col_index_name_b = "(" + col_index_name + " - " + var_name_ + "_first_cols)"; - std::map 
generated_b; + std::unordered_map generated_b; kernel_parts parts_b = this->template get_arg<1>().get_kernel_parts( generated_b, generated_all, name_gen, row_index_name, col_index_name_b, true); @@ -291,14 +292,15 @@ class append_col_ : public operation_cl, * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; this->template get_arg<0>().set_args(generated, generated_all, kernel, arg_num); - std::map generated_b; + std::unordered_map generated_b; this->template get_arg<1>().set_args(generated_b, generated_all, kernel, arg_num); kernel.setArg(arg_num++, this->template get_arg<0>().cols()); diff --git a/stan/math/opencl/kernel_generator/as_column_vector_or_scalar.hpp b/stan/math/opencl/kernel_generator/as_column_vector_or_scalar.hpp index cc995636933..b2c91b1b0ac 100644 --- a/stan/math/opencl/kernel_generator/as_column_vector_or_scalar.hpp +++ b/stan/math/opencl/kernel_generator/as_column_vector_or_scalar.hpp @@ -71,8 +71,8 @@ class as_column_vector_or_scalar_ * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -82,7 +82,7 @@ class as_column_vector_or_scalar_ std::string row_index_name_arg = row_index_name; std::string col_index_name_arg = col_index_name; modify_argument_indices(row_index_name_arg, col_index_name_arg); - std::map generated2; + std::unordered_map 
generated2; res = this->template get_arg<0>().get_kernel_parts( generated2, generated_all, name_gen, row_index_name_arg, col_index_name_arg, view_handled); @@ -134,8 +134,8 @@ class as_column_vector_or_scalar_ * @return part of kernel with code for this expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { if (generated.count(this) == 0) { @@ -145,7 +145,7 @@ class as_column_vector_or_scalar_ std::string row_index_name_arg = row_index_name; std::string col_index_name_arg = col_index_name; modify_argument_indices(row_index_name_arg, col_index_name_arg); - std::map generated2; + std::unordered_map generated2; kernel_parts res = this->template get_arg<0>().get_kernel_parts_lhs( generated2, generated_all, name_gen, row_index_name_arg, col_index_name_arg); @@ -185,12 +185,13 @@ class as_column_vector_or_scalar_ * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; - std::map generated2; + std::unordered_map generated2; this->template get_arg<0>().set_args(generated2, generated_all, kernel, arg_num); if (generated_all.count(this) == 0) { diff --git a/stan/math/opencl/kernel_generator/block_zero_based.hpp b/stan/math/opencl/kernel_generator/block_zero_based.hpp index 97afe96606c..4c0eeaeb2ac 100644 --- a/stan/math/opencl/kernel_generator/block_zero_based.hpp +++ b/stan/math/opencl/kernel_generator/block_zero_based.hpp @@ -101,8 +101,8 @@ class block_ * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -112,7 +112,7 @@ class block_ std::string row_index_name_arg = row_index_name; std::string col_index_name_arg = col_index_name; modify_argument_indices(row_index_name_arg, col_index_name_arg); - std::map generated2; + std::unordered_map generated2; res = this->template get_arg<0>().get_kernel_parts( generated2, generated_all, name_gen, row_index_name_arg, col_index_name_arg, view_handled); @@ -175,8 +175,8 @@ class block_ * @return part of kernel with code for this expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { if (generated.count(this) == 0) { @@ -186,7 +186,7 @@ class block_ 
std::string row_index_name_arg = row_index_name; std::string col_index_name_arg = col_index_name; modify_argument_indices(row_index_name_arg, col_index_name_arg); - std::map generated2; + std::unordered_map generated2; kernel_parts res = this->template get_arg<0>().get_kernel_parts_lhs( generated2, generated_all, name_gen, row_index_name_arg, col_index_name_arg); @@ -226,12 +226,13 @@ class block_ * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; - std::map generated2; + std::unordered_map generated2; this->template get_arg<0>().set_args(generated2, generated_all, kernel, arg_num); if (generated_all.count(this) == 0) { diff --git a/stan/math/opencl/kernel_generator/calc_if.hpp b/stan/math/opencl/kernel_generator/calc_if.hpp index 79475bdef3d..f9b5ef5ee6c 100644 --- a/stan/math/opencl/kernel_generator/calc_if.hpp +++ b/stan/math/opencl/kernel_generator/calc_if.hpp @@ -66,10 +66,10 @@ class calc_if_ */ template kernel_parts get_whole_kernel_parts( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, - const T_result& result) const { + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const T_result& result) const { if (Do_Calculate) { return this->template get_arg<0>().get_whole_kernel_parts( generated, generated_all, ng, row_index_name, col_index_name, result); @@ -88,9 +88,10 @@ class calc_if_ * @param[in,out] arg_num consecutive number of the first argument to set. 
* This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (Do_Calculate) { this->template get_arg<0>().set_args(generated, generated_all, kernel, arg_num); diff --git a/stan/math/opencl/kernel_generator/check_cl.hpp b/stan/math/opencl/kernel_generator/check_cl.hpp index 2af27c85dc1..85b675b7540 100644 --- a/stan/math/opencl/kernel_generator/check_cl.hpp +++ b/stan/math/opencl/kernel_generator/check_cl.hpp @@ -78,8 +78,8 @@ class check_cl_ : public operation_cl_lhs, bool> { * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { kernel_parts res; @@ -110,9 +110,10 @@ class check_cl_ : public operation_cl_lhs, bool> { * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { generated[this] = ""; arg_.set_args(generated, generated_all, kernel, arg_num); kernel.setArg(arg_num++, buffer_.buffer()); diff --git a/stan/math/opencl/kernel_generator/colwise_reduction.hpp b/stan/math/opencl/kernel_generator/colwise_reduction.hpp index 7a623e68f25..e402ad1edeb 100644 --- a/stan/math/opencl/kernel_generator/colwise_reduction.hpp +++ b/stan/math/opencl/kernel_generator/colwise_reduction.hpp @@ -96,10 +96,10 @@ class colwise_reduction */ template kernel_parts get_whole_kernel_parts( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, - const T_result& result) const { + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const T_result& result) const { kernel_parts parts = derived().get_kernel_parts( generated, generated_all, ng, row_index_name, col_index_name, false); kernel_parts out_parts = result.get_kernel_parts_lhs( diff --git a/stan/math/opencl/kernel_generator/constant.hpp b/stan/math/opencl/kernel_generator/constant.hpp index 313309e8746..078b08fd0f7 100644 --- a/stan/math/opencl/kernel_generator/constant.hpp +++ b/stan/math/opencl/kernel_generator/constant.hpp @@ -80,9 +80,10 @@ class constant_ : public operation_cl, T> { * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; kernel.setArg(arg_num++, a_); diff --git a/stan/math/opencl/kernel_generator/indexing.hpp b/stan/math/opencl/kernel_generator/indexing.hpp index d57cf612775..e997531c19b 100644 --- a/stan/math/opencl/kernel_generator/indexing.hpp +++ b/stan/math/opencl/kernel_generator/indexing.hpp @@ -92,8 +92,8 @@ class indexing_ * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -110,7 +110,7 @@ class indexing_ kernel_parts parts_col_idx = col_index.get_kernel_parts( generated, generated_all, name_gen, row_index_name, col_index_name, view_handled); - std::map generated2; + std::unordered_map generated2; kernel_parts parts_mat = mat.get_kernel_parts( generated2, generated_all, name_gen, row_index.var_name_, col_index.var_name_, false); @@ -134,8 +134,8 @@ class indexing_ * @return part of kernel with code for this expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { if (generated.count(this) == 0) { @@ -151,7 +151,7 @@ class indexing_ kernel_parts parts_col_idx = col_index.get_kernel_parts(generated, generated_all, name_gen, row_index_name, col_index_name, false); - std::map generated2; + std::unordered_map generated2; kernel_parts 
parts_mat = mat.get_kernel_parts_lhs(generated2, generated_all, name_gen, row_index.var_name_, col_index.var_name_); @@ -171,16 +171,17 @@ class indexing_ * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; this->template get_arg<1>().set_args(generated, generated_all, kernel, arg_num); this->template get_arg<2>().set_args(generated, generated_all, kernel, arg_num); - std::map generated2; + std::unordered_map generated2; this->template get_arg<0>().set_args(generated2, generated_all, kernel, arg_num); } diff --git a/stan/math/opencl/kernel_generator/load.hpp b/stan/math/opencl/kernel_generator/load.hpp index 6f48252d89e..8615b4aefc2 100644 --- a/stan/math/opencl/kernel_generator/load.hpp +++ b/stan/math/opencl/kernel_generator/load.hpp @@ -77,8 +77,8 @@ class load_ * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -146,8 +146,8 @@ class load_ * @return part of kernel with code for this expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { if (generated_all.count(&a_) == 0) { @@ -193,9 +193,10 @@ class load_ * @param[in,out] arg_num 
consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated_all.count(&a_) == 0) { generated_all[&a_] = ""; kernel.setArg(arg_num++, a_.buffer()); @@ -324,9 +325,9 @@ class load_ * @param[in,out] id_map map from memory addresses to unique ids * @param[in,out] next_id neqt unique id to use */ - inline void get_unique_matrix_accesses(std::vector& uids, - std::map& id_map, - int& next_id) const { + inline void get_unique_matrix_accesses( + std::vector& uids, std::unordered_map& id_map, + int& next_id) const { if (id_map.count(&a_) == 0) { id_map[&a_] = next_id; uids.push_back(next_id); diff --git a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp index 167cd44ed13..3fe16779676 100644 --- a/stan/math/opencl/kernel_generator/multi_result_kernel.hpp +++ b/stan/math/opencl/kernel_generator/multi_result_kernel.hpp @@ -128,9 +128,10 @@ struct multi_result_kernel_internal { * @return kernel parts for the kernel */ static kernel_parts generate( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const std::tuple...>& assignment_pairs) { kernel_parts parts @@ -157,9 +158,9 @@ struct multi_result_kernel_internal { * @param assignment_pairs pairs of result and expression */ static void set_args( - std::map& generated, - std::map& generated_all, cl::Kernel& kernel, - int& arg_num, + std::unordered_map& generated, + std::unordered_map& 
generated_all, + cl::Kernel& kernel, int& arg_num, const std::tuple...>& assignment_pairs) { next::set_args(generated, generated_all, kernel, arg_num, @@ -192,7 +193,7 @@ struct multi_result_kernel_internal { * @param assignment_pairs pairs of result and expression */ static void get_unique_matrix_accesses( - std::vector& uids, std::map& id_map, + std::vector& uids, std::unordered_map& id_map, int& next_id, const std::tuple...>& assignment_pairs) { @@ -221,18 +222,19 @@ struct multi_result_kernel_internal<-1, T_results...> { assignment_pairs) {} static kernel_parts generate( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const std::tuple...>& assignment_pairs) { return {}; } static void set_args( - std::map& generated, - std::map& generated_all, cl::Kernel& kernel, - int& arg_num, + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num, const std::tuple...>& assignment_pairs) {} @@ -241,7 +243,7 @@ struct multi_result_kernel_internal<-1, T_results...> { assignment_pairs) {} static void get_unique_matrix_accesses( - std::vector& uids, std::map& id_map, + std::vector& uids, std::unordered_map& id_map, int& next_id, const std::tuple...>& assignment_pairs) {} @@ -439,8 +441,8 @@ class results_cl { {std::decay_t::Deriv::require_specific_local_size...}); name_generator ng; - std::map generated; - std::map generated_all; + std::unordered_map generated; + std::unordered_map generated_all; kernel_parts parts = impl::generate(generated, generated_all, ng, "i", "j", assignment_pairs); std::string src; @@ -529,7 +531,7 @@ class results_cl { } std::vector uids; - std::map id_map; + std::unordered_map id_map; int next_id = 0; impl::get_unique_matrix_accesses(uids, id_map, 
next_id, assignment_pairs); @@ -545,8 +547,8 @@ class results_cl { cl::Kernel& kernel = impl::kernel_cache_[uids]; int arg_num = 0; - std::map generated; - std::map generated_all; + std::unordered_map generated; + std::unordered_map generated_all; impl::set_args(generated, generated_all, kernel, arg_num, assignment_pairs); diff --git a/stan/math/opencl/kernel_generator/opencl_code.hpp b/stan/math/opencl/kernel_generator/opencl_code.hpp index 23444da1668..ab53950e14b 100644 --- a/stan/math/opencl/kernel_generator/opencl_code.hpp +++ b/stan/math/opencl/kernel_generator/opencl_code.hpp @@ -181,12 +181,11 @@ class opencl_code_ : public operation_cl_base { * @param view_handled whether caller already handled matrix view * @return part of kernel with code for this and nested expressions */ - auto get_kernel_parts(std::map& generated, - std::map& generated_all, - name_generator& name_gen, - const std::string& row_index_name, - const std::string& col_index_name, - bool view_handled) const { + auto get_kernel_parts( + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& name_gen, const std::string& row_index_name, + const std::string& col_index_name, bool view_handled) const { return impl_->get_kernel_parts(generated, generated_all, name_gen, row_index_name, col_index_name, view_handled); @@ -202,8 +201,8 @@ class opencl_code_ : public operation_cl_base { * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - auto set_args(std::map& generated, - std::map& generated_all, + auto set_args(std::unordered_map& generated, + std::unordered_map& generated_all, cl::Kernel& kernel, int& arg_num) const { return impl_->set_args(generated, generated_all, kernel, arg_num); } @@ -300,7 +299,7 @@ class opencl_code_ : public operation_cl_base { * @param[in,out] next_id neqt unique id to use */ auto get_unique_matrix_accesses(std::vector& uids, - std::map& id_map, + std::unordered_map& id_map, int& next_id) const { return impl_->get_unique_matrix_accesses(uids, id_map, next_id); } diff --git a/stan/math/opencl/kernel_generator/operation_cl.hpp b/stan/math/opencl/kernel_generator/operation_cl.hpp index 43a50b59654..fcef4aa1c14 100644 --- a/stan/math/opencl/kernel_generator/operation_cl.hpp +++ b/stan/math/opencl/kernel_generator/operation_cl.hpp @@ -212,10 +212,10 @@ class operation_cl : public operation_cl_base { */ template kernel_parts get_whole_kernel_parts( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, - const T_result& result) const { + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const T_result& result) const { kernel_parts parts = derived().get_kernel_parts( generated, generated_all, ng, row_index_name, col_index_name, false); kernel_parts out_parts = result.get_kernel_parts_lhs( @@ -238,8 +238,8 @@ class operation_cl : public operation_cl_base { * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -250,7 +250,7 @@ class operation_cl : public 
operation_cl_base { std::string col_index_name_arg = col_index_name; derived().modify_argument_indices(row_index_name_arg, col_index_name_arg); std::array args_parts = index_apply([&](auto... Is) { - std::map generated2; + std::unordered_map generated2; return std::array{this->get_arg().get_kernel_parts( &Derived::modify_argument_indices == &operation_cl::modify_argument_indices @@ -312,9 +312,10 @@ class operation_cl : public operation_cl_base { * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; // parameter pack expansion returns a comma-separated list of values, @@ -323,7 +324,7 @@ class operation_cl : public operation_cl_base { // initializer_list from. Cast to voids avoids warnings about unused // expression. index_apply([&](auto... Is) { - std::map generated2; + std::unordered_map generated2; static_cast(std::initializer_list{ (this->get_arg().set_args( &Derived::modify_argument_indices @@ -452,9 +453,9 @@ class operation_cl : public operation_cl_base { * @param[in,out] id_map map from memory addresses to unique ids * @param[in,out] next_id neqt unique id to use */ - inline void get_unique_matrix_accesses(std::vector& uids, - std::map& id_map, - int& next_id) const { + inline void get_unique_matrix_accesses( + std::vector& uids, std::unordered_map& id_map, + int& next_id) const { index_apply([&](auto... 
Is) { static_cast(std::initializer_list{( this->get_arg().get_unique_matrix_accesses(uids, id_map, next_id), diff --git a/stan/math/opencl/kernel_generator/operation_cl_lhs.hpp b/stan/math/opencl/kernel_generator/operation_cl_lhs.hpp index 3410dd24eb4..eb0ccc0212e 100644 --- a/stan/math/opencl/kernel_generator/operation_cl_lhs.hpp +++ b/stan/math/opencl/kernel_generator/operation_cl_lhs.hpp @@ -47,8 +47,8 @@ class operation_cl_lhs : public operation_cl, * @return part of kernel with code for this expressions */ inline kernel_parts get_kernel_parts_lhs( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) const { if (generated.count(this) == 0) { @@ -59,7 +59,7 @@ class operation_cl_lhs : public operation_cl, std::string col_index_name_arg = col_index_name; derived().modify_argument_indices(row_index_name_arg, col_index_name_arg); std::array args_parts = index_apply([&](auto... Is) { - std::map generated2; + std::unordered_map generated2; return std::array{ this->template get_arg().get_kernel_parts_lhs( &Derived::modify_argument_indices diff --git a/stan/math/opencl/kernel_generator/optional_broadcast.hpp b/stan/math/opencl/kernel_generator/optional_broadcast.hpp index 722c0dc661f..01cd3e2528d 100644 --- a/stan/math/opencl/kernel_generator/optional_broadcast.hpp +++ b/stan/math/opencl/kernel_generator/optional_broadcast.hpp @@ -100,12 +100,13 @@ class optional_broadcast_ * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; - std::map generated2; + std::unordered_map generated2; this->template get_arg<0>().set_args(generated2, generated_all, kernel, arg_num); if (Colwise) { diff --git a/stan/math/opencl/kernel_generator/reduction_2d.hpp b/stan/math/opencl/kernel_generator/reduction_2d.hpp index ed3131a888b..33c957ea5f6 100644 --- a/stan/math/opencl/kernel_generator/reduction_2d.hpp +++ b/stan/math/opencl/kernel_generator/reduction_2d.hpp @@ -80,10 +80,10 @@ class reduction_2d */ template kernel_parts get_whole_kernel_parts( - std::map& generated, - std::map& generated_all, name_generator& ng, - const std::string& row_index_name, const std::string& col_index_name, - const T_result& result) const { + std::unordered_map& generated, + std::unordered_map& generated_all, + name_generator& ng, const std::string& row_index_name, + const std::string& col_index_name, const T_result& result) const { kernel_parts parts = derived().get_kernel_parts( generated, generated_all, ng, row_index_name, col_index_name, false); kernel_parts out_parts = result.get_kernel_parts_lhs( diff --git a/stan/math/opencl/kernel_generator/rowwise_reduction.hpp b/stan/math/opencl/kernel_generator/rowwise_reduction.hpp index 5ff44dd1fff..0dfdec2d5a4 100644 --- a/stan/math/opencl/kernel_generator/rowwise_reduction.hpp +++ b/stan/math/opencl/kernel_generator/rowwise_reduction.hpp @@ -31,8 +31,8 @@ struct matvec_mul_opt { static matrix_cl_view view(const Arg&) { return matrix_cl_view::Entire; } static kernel_parts get_kernel_parts( - const Arg& a, std::map& generated, - std::map& generated_all, + const Arg& a, std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const 
std::string& row_index_name, const std::string& col_index_name) { return {}; @@ -71,8 +71,8 @@ struct matvec_mul_opt>> { * @return part of kernel with code for this and nested expressions */ static kernel_parts get_kernel_parts( - const Arg& mul, std::map& generated, - std::map& generated_all, + const Arg& mul, std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name) { kernel_parts res{}; @@ -154,8 +154,8 @@ class rowwise_reduction * @return part of kernel with code for this and nested expressions */ inline kernel_parts get_kernel_parts( - std::map& generated, - std::map& generated_all, + std::unordered_map& generated, + std::unordered_map& generated_all, name_generator& name_gen, const std::string& row_index_name, const std::string& col_index_name, bool view_handled) const { kernel_parts res{}; @@ -163,7 +163,7 @@ class rowwise_reduction this->var_name_ = name_gen.generate(); generated[this] = ""; - std::map generated2; + std::unordered_map generated2; if (PassZero && internal::matvec_mul_opt::is_possible) { res = internal::matvec_mul_opt::get_kernel_parts( this->template get_arg<0>(), generated2, generated_all, name_gen, @@ -245,12 +245,13 @@ class rowwise_reduction * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. 
*/ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; - std::map generated2; + std::unordered_map generated2; this->template get_arg<0>().set_args(generated2, generated_all, kernel, arg_num); kernel.setArg(arg_num++, this->template get_arg<0>().view()); diff --git a/stan/math/opencl/kernel_generator/scalar.hpp b/stan/math/opencl/kernel_generator/scalar.hpp index 91476cb1230..34216ea9349 100644 --- a/stan/math/opencl/kernel_generator/scalar.hpp +++ b/stan/math/opencl/kernel_generator/scalar.hpp @@ -72,9 +72,10 @@ class scalar_ : public operation_cl, T> { * @param[in,out] arg_num consecutive number of the first argument to set. * This is incremented for each argument set by this function. */ - inline void set_args(std::map& generated, - std::map& generated_all, - cl::Kernel& kernel, int& arg_num) const { + inline void set_args( + std::unordered_map& generated, + std::unordered_map& generated_all, + cl::Kernel& kernel, int& arg_num) const { if (generated.count(this) == 0) { generated[this] = ""; kernel.setArg(arg_num++, a_); diff --git a/stan/math/opencl/matrix_cl.hpp b/stan/math/opencl/matrix_cl.hpp index 0fd044f9b02..bf276bb19f5 100644 --- a/stan/math/opencl/matrix_cl.hpp +++ b/stan/math/opencl/matrix_cl.hpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -50,8 +51,8 @@ class matrix_cl : public matrix_cl_base { int cols_{0}; // Number of columns. 
// Holds info on if matrix is a special type matrix_cl_view view_{matrix_cl_view::Entire}; - mutable std::vector write_events_; // Tracks write jobs - mutable std::vector read_events_; // Tracks reads + mutable tbb::concurrent_vector write_events_; // Tracks write jobs + mutable tbb::concurrent_vector read_events_; // Tracks reads public: using Scalar = T; // Underlying type of the matrix @@ -99,7 +100,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The write event stack. */ - inline const std::vector& write_events() const { + inline const tbb::concurrent_vector& write_events() const { return write_events_; } @@ -107,7 +108,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. */ - inline const std::vector& read_events() const { + inline const tbb::concurrent_vector& read_events() const { return read_events_; } @@ -115,7 +116,7 @@ class matrix_cl : public matrix_cl_base { * Get the events from the event stacks. * @return The read/write event stack. 
*/ - inline const std::vector read_write_events() const { + inline const tbb::concurrent_vector read_write_events() const { return vec_concat(this->read_events(), this->write_events()); } @@ -615,15 +616,28 @@ class matrix_cl : public matrix_cl_base { * @param A matrix_cl */ void initialize_buffer_cl(const matrix_cl& A) { + cl::Event cstr_event; + std::vector* dep_events = new std::vector( + A.write_events().begin(), A.write_events().end()); try { - cl::Event cstr_event; opencl_context.queue().enqueueCopyBuffer(A.buffer(), this->buffer(), 0, 0, - A.size() * sizeof(T), - &A.write_events(), &cstr_event); + A.size() * sizeof(T), dep_events, + &cstr_event); + if (opencl_context.device()[0].getInfo()) { + buffer_cl_.setDestructorCallback( + &delete_it_destructor>, dep_events); + } else { + cstr_event.setCallback( + CL_COMPLETE, &delete_it_event>, dep_events); + } this->add_write_event(cstr_event); A.add_read_event(cstr_event); } catch (const cl::Error& e) { + delete dep_events; check_opencl_error("copy (OpenCL)->(OpenCL)", e); + } catch (...) { + delete dep_events; + throw; } } diff --git a/stan/math/opencl/opencl_context.hpp b/stan/math/opencl/opencl_context.hpp index 764051a5737..e2373df126d 100644 --- a/stan/math/opencl/opencl_context.hpp +++ b/stan/math/opencl/opencl_context.hpp @@ -14,10 +14,11 @@ #include #include +#include #include #include #include -#include +#include #include #include #include @@ -160,7 +161,7 @@ class opencl_context_base { // the device bool in_order_; // Whether to use out of order execution. // Holds Default parameter values for each Kernel. 
- using map_base_opts = std::map; + using map_base_opts = std::unordered_map; map_base_opts base_opts_ = {{"LOWER", static_cast(matrix_cl_view::Lower)}, {"UPPER", static_cast(matrix_cl_view::Upper)}, @@ -193,7 +194,7 @@ class opencl_context_base { } tuning_opts_; protected: - static opencl_context_base& getInstance() { + static opencl_context_base& getInstance() noexcept { static opencl_context_base instance_; return instance_; } @@ -207,7 +208,7 @@ class opencl_context_base { * The API to access the methods and values in opencl_context_base */ class opencl_context { - std::vector kernel_caches_; + tbb::concurrent_vector kernel_caches_; public: opencl_context() = default; @@ -351,7 +352,7 @@ class opencl_context { * objects. For stan, there should only be one context, queue, device, and * program with multiple kernels. */ - inline cl::Context& context() { + inline cl::Context& context() noexcept { return opencl_context_base::getInstance().context_; } /** \ingroup opencl_context_group @@ -359,13 +360,13 @@ class opencl_context { * One command queue will exist per device where * kernels are placed on the command queue and by default executed in order. */ - inline cl::CommandQueue& queue() { + inline cl::CommandQueue& queue() noexcept { return opencl_context_base::getInstance().command_queue_; } /** \ingroup opencl_context_group * Returns a copy of the map of kernel defines */ - inline opencl_context_base::map_base_opts& base_opts() { + inline opencl_context_base::map_base_opts& base_opts() noexcept { return opencl_context_base::getInstance().base_opts_; } /** \ingroup opencl_context_group @@ -375,35 +376,35 @@ class opencl_context { * max workgroup of 256 would allow thread blocks of sizes (16,16), (128,2), * (8, 32), etc. 
*/ - inline int max_thread_block_size() { + inline int max_thread_block_size() noexcept { return opencl_context_base::getInstance().max_thread_block_size_; } /** \ingroup opencl_context_group * Returns the thread block size for the Cholesky Decompositions L_11. */ - inline opencl_context_base::tuning_struct& tuning_opts() { + inline opencl_context_base::tuning_struct& tuning_opts() noexcept { return opencl_context_base::getInstance().tuning_opts_; } /** \ingroup opencl_context_group * Returns a vector containing the OpenCL device used to create the context */ - inline std::vector& device() { + inline std::vector& device() noexcept { return opencl_context_base::getInstance().device_; } /** \ingroup opencl_context_group * Returns a vector containing the OpenCL platform used to create the context */ - inline std::vector& platform() { + inline std::vector& platform() noexcept { return opencl_context_base::getInstance().platform_; } /** \ingroup opencl_context_group * Return a bool representing whether the write to the OpenCL device are * blocking */ - inline bool& in_order() { + inline bool& in_order() noexcept { return opencl_context_base::getInstance().in_order_; } diff --git a/stan/math/opencl/prim/normal_lccdf.hpp b/stan/math/opencl/prim/normal_lccdf.hpp index 24a49a70f6f..e68008509f8 100644 --- a/stan/math/opencl/prim/normal_lccdf.hpp +++ b/stan/math/opencl/prim/normal_lccdf.hpp @@ -64,12 +64,12 @@ return_type_t normal_lccdf( auto sigma_positive_expr = 0 < sigma_val; auto scaled_diff = elt_divide(y_val - mu_val, sigma_val * SQRT_TWO); - auto one_m_erf = select( + matrix_cl one_m_erf = select( scaled_diff < -37.5 * INV_SQRT_TWO, 2.0, select(scaled_diff < -5.0 * INV_SQRT_TWO, 2.0 - erfc(-scaled_diff), select(scaled_diff > 8.25 * INV_SQRT_TWO, 0.0, 1.0 - erf(scaled_diff)))); - auto lccdf_expr = colwise_sum(log(one_m_erf)); + auto lccdf_expr = log(one_m_erf); auto mu_deriv = select(scaled_diff > 8.25 * INV_SQRT_TWO, INFTY, SQRT_TWO_OVER_SQRT_PI * 
elt_divide(exp(-square(scaled_diff)), @@ -88,7 +88,9 @@ return_type_t normal_lccdf( = expressions(lccdf_expr, calc_if::value>(y_deriv), calc_if::value>(mu_deriv), calc_if::value>(sigma_deriv)); - T_partials_return lccdf = LOG_HALF + sum(from_matrix_cl(lccdf_cl)); + + T_partials_return lccdf + = LOG_HALF * lccdf_cl.size() + sum(from_matrix_cl(lccdf_cl)); auto ops_partials = make_partials_propagator(y_col, mu_col, sigma_col); diff --git a/stan/math/opencl/zeros_strict_tri.hpp b/stan/math/opencl/zeros_strict_tri.hpp index 3ff0b92842a..339f987cb59 100644 --- a/stan/math/opencl/zeros_strict_tri.hpp +++ b/stan/math/opencl/zeros_strict_tri.hpp @@ -44,7 +44,7 @@ inline void matrix_cl::zeros_strict_tri() try { return; } this->view_ = both(this->view_, invert(matrix_view)); - cl::CommandQueue cmdQueue = opencl_context.queue(); + cl::CommandQueue& cmdQueue = opencl_context.queue(); opencl_kernels::fill_strict_tri(cl::NDRange(this->rows(), this->cols()), *this, 0.0, this->rows(), this->cols(), matrix_view); diff --git a/stan/math/prim/fun/vec_concat.hpp b/stan/math/prim/fun/vec_concat.hpp index ed8a5df0c21..67cd6f5b599 100644 --- a/stan/math/prim/fun/vec_concat.hpp +++ b/stan/math/prim/fun/vec_concat.hpp @@ -37,7 +37,9 @@ inline void append_vectors(VecInOut& x) {} template inline void append_vectors(VecInOut& x, const VecIn& y, const VecArgs&... args) { - x.insert(x.end(), y.begin(), y.end()); + for (auto& yy : y) { + x.push_back(yy); + } append_vectors(x, args...); } } // namespace internal @@ -53,7 +55,7 @@ inline void append_vectors(VecInOut& x, const VecIn& y, */ template inline auto vec_concat(const Vec& v1, const Args&... args) { - std::vector> vec; + Vec vec; vec.reserve(internal::sum_vector_sizes(v1, args...)); internal::append_vectors(vec, v1, args...); return vec;