From 298ef0d69d8d8a226834833a283a6d9e84a75c23 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 27 Jul 2020 14:05:15 +0200 Subject: [PATCH 1/7] use logging --- examples/simgauss_tf.py | 2 +- src/vegasflow/configflow.py | 20 +++++++++++++++++++- src/vegasflow/monte_carlo.py | 13 ++++++++----- src/vegasflow/vflow.py | 9 ++++++--- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/examples/simgauss_tf.py b/examples/simgauss_tf.py index 333262d..a6d8ed9 100644 --- a/examples/simgauss_tf.py +++ b/examples/simgauss_tf.py @@ -1,8 +1,8 @@ # Place your function here +from vegasflow.configflow import DTYPE, DTYPEINT import time import numpy as np import tensorflow as tf -from vegasflow.configflow import DTYPE, DTYPEINT from vegasflow.vflow import vegas_wrapper from vegasflow.plain import plain_wrapper diff --git a/src/vegasflow/configflow.py b/src/vegasflow/configflow.py index a390e7c..1684069 100644 --- a/src/vegasflow/configflow.py +++ b/src/vegasflow/configflow.py @@ -1,8 +1,26 @@ """ Define some constants, header style """ +import os +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Most of this can be moved to a yaml file without loss of generality import tensorflow as tf +# uncomment this line for debugging to avoid compiling any tf.function +# tf.config.experimental_run_functions_eagerly(True) + +# Configure logging +import logging + +module_name = __name__.split(".")[0] +logger = logging.getLogger(module_name) +# Set level debug for development +logger.setLevel(logging.DEBUG) +# Create a handler and format it +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.DEBUG) +console_format = logging.Formatter("[%(levelname)s] %(message)s") +console_handler.setFormatter(console_format) +logger.addHandler(console_handler) # Define the tf.numberic types DTYPE = tf.float64 @@ -15,7 +33,7 @@ # Set up the logistics of the integration # Events Limit limits how many events are done in one single run of the event_loop # set it lower if hitting memory problems -MAX_EVENTS_LIMIT = int(1e7) +MAX_EVENTS_LIMIT = int(1e6) # Select the list of devices to look for DEFAULT_ACTIVE_DEVICES = ["GPU"] # , 'CPU'] diff --git a/src/vegasflow/monte_carlo.py b/src/vegasflow/monte_carlo.py index 7f3968a..a2b5722 100644 --- a/src/vegasflow/monte_carlo.py +++ b/src/vegasflow/monte_carlo.py @@ -41,6 +41,9 @@ import tensorflow as tf from vegasflow.configflow import MAX_EVENTS_LIMIT, DEFAULT_ACTIVE_DEVICES, DTYPE +import logging +logger = logging.getLogger(__name__) + def print_iteration(it, res, error, extra="", threshold=0.1): """ Checks the size of the result to select between @@ -48,9 +51,9 @@ def print_iteration(it, res, error, extra="", threshold=0.1): # note: actually, the flag 'g' does this automatically # but I prefer to choose the precision myself... 
if res < threshold:
-        print(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra)
+        logger.info(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra)
     else:
-        print(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra)
+        logger.info(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra)
 
 
 def _accumulate(accumulators):
     """ Accumulate all the quantities in accumulators
@@ -135,8 +138,8 @@ def events_per_run(self, val):
         """ Set the number of events per single step """
         self._events_per_run = min(val, self.n_events)
         if self.n_events % self._events_per_run != 0:
-            print(
-                f"Warning, the number of events per run step {self._events_per_run} doesn't perfectly"
+            logger.warning(
+                f"The number of events per run step {self._events_per_run} doesn't perfectly "
                 f"divide the number of events {self.n_events}, which can harm performance"
             )
 
@@ -379,7 +382,7 @@ def run_integration(self, n_iter, log_time=True, histograms=None):
         final_result = aux_res / weight_sum
         sigma = np.sqrt(1.0 / weight_sum)
 
-        print(f" > Final results: {final_result.numpy():g} +/- {sigma:g}")
+        logger.info(f" > Final result: {final_result.numpy():g} +/- {sigma:g}")
         return final_result, sigma
 
diff --git a/src/vegasflow/vflow.py b/src/vegasflow/vflow.py
index 1b14572..9c9d758 100644
--- a/src/vegasflow/vflow.py
+++ b/src/vegasflow/vflow.py
@@ -14,6 +14,9 @@
 from vegasflow.monte_carlo import MonteCarloFlow, wrapper
 from vegasflow.utils import consume_array_into_indices
 
+import logging
+logger = logging.getLogger(__name__)
+
 FBINS = float_me(BINS_MAX)
 
 # Auxiliary functions for Vegas
@@ -232,8 +235,8 @@ def load_grid(self, file_name=None, numpy_grid=None):
             integrand_name = self.integrand.__name__
             integrand_grid = json_dict.get("integrand")
             if integrand_name != integrand_grid:
-                print(
-                    f"WARNING: The grid was written for the integrand: {integrand_grid}"
+                logger.warning(
+                    f"The grid was written for the integrand: {integrand_grid} "
                     f"which is different from {integrand_name}"
                 )
         # Now that everything is clear, let's load up the grid
@@ -255,7 +258,7 @@
                     f"current settings is of {self.grid_bins} bins"
                 )
         if file_name:
-            print(f" > SUCCESS: Loaded grid from {file_name}")
+            logger.info(f" > SUCCESS: Loaded grid from {file_name}")
         self.divisions.assign(numpy_grid)
 
     def refine_grid(self, arr_res2):

From 88af86e386f5809904255109d2210053e3ef646a Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 14:05:26 +0200
Subject: [PATCH 2/7] reformatting

---
 src/vegasflow/__init__.py    |  2 +-
 src/vegasflow/configflow.py  |  2 ++
 src/vegasflow/monte_carlo.py |  2 ++
 src/vegasflow/plain.py       |  4 +---
 src/vegasflow/vflow.py       | 14 ++++----------
 5 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/vegasflow/__init__.py b/src/vegasflow/__init__.py
index 40784c8..7b91e6c 100644
--- a/src/vegasflow/__init__.py
+++ b/src/vegasflow/__init__.py
@@ -1,3 +1,3 @@
 """Monte Carlo integration with Tensorflow"""
 
-__version__ = '1.0.2'
+__version__ = "1.0.2"
diff --git a/src/vegasflow/configflow.py b/src/vegasflow/configflow.py
index 1684069..89cbdca 100644
--- a/src/vegasflow/configflow.py
+++ b/src/vegasflow/configflow.py
@@ -2,9 +2,11 @@
 Define some constants, header style
 """
 import os
+
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
 # Most of this can be moved to a yaml file without loss of generality
 import tensorflow as tf
+
 # uncomment this line for debugging to avoid compiling any tf.function
 # tf.config.experimental_run_functions_eagerly(True)
 
diff --git
a/src/vegasflow/monte_carlo.py b/src/vegasflow/monte_carlo.py index a2b5722..046cb15 100644 --- a/src/vegasflow/monte_carlo.py +++ b/src/vegasflow/monte_carlo.py @@ -42,6 +42,7 @@ from vegasflow.configflow import MAX_EVENTS_LIMIT, DEFAULT_ACTIVE_DEVICES, DTYPE import logging + logger = logging.getLogger(__name__) @@ -55,6 +56,7 @@ def print_iteration(it, res, error, extra="", threshold=0.1): else: logger.info(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra) + def _accumulate(accumulators): """ Accumulate all the quantities in accumulators The default accumulation is implemented for tensorflow tensors diff --git a/src/vegasflow/plain.py b/src/vegasflow/plain.py index 4edb249..4bfda36 100644 --- a/src/vegasflow/plain.py +++ b/src/vegasflow/plain.py @@ -21,9 +21,7 @@ def _run_event(self, integrand, ncalls=None): # Jacobian xjac = 1.0 / self.n_events # Generate all random number for this iteration - rnds = tf.random.uniform( - (n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE - ) + rnds = tf.random.uniform((n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE) # Compute the integrand tmp = integrand(rnds, n_dim=self.n_dim, weight=xjac) * xjac tmp2 = tf.square(tmp) diff --git a/src/vegasflow/vflow.py b/src/vegasflow/vflow.py index 9c9d758..773e937 100644 --- a/src/vegasflow/vflow.py +++ b/src/vegasflow/vflow.py @@ -15,6 +15,7 @@ from vegasflow.utils import consume_array_into_indices import logging + logger = logging.getLogger(__name__) FBINS = float_me(BINS_MAX) @@ -138,10 +139,7 @@ def while_body(bin_weight, n_bin, cur, prev): prev = fzero for _ in range(BINS_MAX - 1): bin_weight, n_bin, cur, prev = tf.while_loop( - while_check, - while_body, - (bin_weight, n_bin, cur, prev), - parallel_iterations=1, + while_check, while_body, (bin_weight, n_bin, cur, prev), parallel_iterations=1, ) bin_weight -= ave_t delta = (cur - prev) * bin_weight / wei_t[n_bin] @@ -273,9 +271,7 @@ def refine_grid(self, arr_res2): Function not compiled """ for j in range(self.n_dim): - new_divisions = refine_grid_per_dimension( - arr_res2[j, :], self.divisions[j, :] - ) + new_divisions = refine_grid_per_dimension(arr_res2[j, :], self.divisions[j, :]) self.divisions[j, :].assign(new_divisions) def _run_event(self, integrand, ncalls=None): @@ -320,9 +316,7 @@ def _run_event(self, integrand, ncalls=None): # If the training is active, save the result of the integral sq for j in range(self.n_dim): arr_res2.append( - consume_array_into_indices( - tmp2, ind[:, j : j + 1], self.grid_bins - 1 - ) + consume_array_into_indices(tmp2, ind[:, j : j + 1], self.grid_bins - 1) ) arr_res2 = tf.reshape(arr_res2, (self.n_dim, -1)) From 27c43f9ddf2d6fa7a76c20e09fcbd51bf8d7d98e Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 27 Jul 2020 16:28:00 +0200 Subject: [PATCH 3/7] add the c++ integrand --- examples/cuda/cuda_example.py | 28 ++++++++++++++++++ examples/cuda/integrand.cpp | 55 +++++++++++++++++++++++++++++++++++ examples/cuda/makefile | 15 ++++++++++ 3 files changed, 98 insertions(+) create mode 100644 examples/cuda/cuda_example.py create mode 100644 examples/cuda/integrand.cpp create mode 100644 examples/cuda/makefile diff --git a/examples/cuda/cuda_example.py b/examples/cuda/cuda_example.py new file mode 100644 index 0000000..fad1537 --- /dev/null +++ b/examples/cuda/cuda_example.py @@ -0,0 +1,28 @@ +from vegasflow.configflow import DTYPE, DTYPEINT +import time +import numpy as np +import tensorflow as tf +from vegasflow.plain import plain_wrapper + +# MC integration setup +dim = 4 +ncalls = np.int32(1e4) 
+n_iter = 5
+
+integrand_module = tf.load_op_library('./integrand.so')
+
+@tf.function
+def wrapper_integrand(xarr, **kwargs):
+    return integrand_module.integrand_op(xarr)
+
+@tf.function
+def fully_python_integrand(xarr, **kwargs):
+    return tf.reduce_sum(xarr, axis=1)
+
+if __name__ == "__main__":
+    print(f"Plain MC, ncalls={ncalls}:")
+    start = time.time()
+    ncalls = 10*ncalls
+    r = plain_wrapper(wrapper_integrand, dim, n_iter, ncalls)
+    end = time.time()
+    print(f"Plain MC took: time (s): {end-start}")
diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
new file mode 100644
index 0000000..09427b9
--- /dev/null
+++ b/examples/cuda/integrand.cpp
@@ -0,0 +1,55 @@
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+/*
+ * In this example we follow the TF guide for operation creation
+ * https://www.tensorflow.org/guide/create_op
+ * to create an integrand as a custom operator.
+ *
+ * To first approximation, these operators are functions that take
+ * a tensor and return a tensor.
+ */
+
+using namespace tensorflow;
+
+REGISTER_OP("IntegrandOp")
+.Input("xarr: double")
+.Output("ret: double")
+.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+    c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+    return Status::OK();
+  });
+
+class IntegrandOp: public OpKernel {
+  public:
+    explicit IntegrandOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+    void Compute(OpKernelContext* context) override {
+      // Grab the input tensor
+      // the input tensor is expected to have a (nevents, ndim) shape
+      const Tensor& input_tensor = context->input(0);
+      auto input = input_tensor.tensor<double, 2>();
+      auto input_shape = input_tensor.shape();
+
+      // Create an output tensor
+      // the expected shape is (nevents,)
+      Tensor* output_tensor = NULL;
+      TensorShape output_shape;
+      const int N = input_shape.dim_size(0);
+      output_shape.AddDim(N);
+      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                       &output_tensor));
+      auto output_flat = output_tensor->flat<double>();
+
+      // Sum in the dimensional axis
+      for (int i = 0; i < N; i++) {
+        output_flat(i) = 0.0;
+        for(int j = 0; j < input_shape.dim_size(1); j++) {
+          output_flat(i) += input(i,j);
+        }
+      }
+    }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp);
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
new file mode 100644
index 0000000..2ef2729
--- /dev/null
+++ b/examples/cuda/makefile
@@ -0,0 +1,15 @@
+target_cpu=integrand.so
+cpu_src=integrand.cpp
+
+TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
+TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
+
+CXX=g++
+CXFLAGS=-std=c++14 -shared -fPIC -O2
+
+run_cpu: $(target_cpu)
+	@python cuda_example.py
+
+$(target_cpu): $(cpu_src)
+	@echo "Compiling CPU kernel"
+	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
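
A quick way to check the freshly compiled operator against the pure-TF reference it mirrors — a minimal sketch, assuming `make run_cpu` above has already produced ./integrand.so in the working directory:

    import numpy as np
    import tensorflow as tf

    integrand_module = tf.load_op_library("./integrand.so")

    # random points with the (nevents, ndim) shape the op expects
    xarr = tf.random.uniform((100, 4), dtype=tf.float64)
    # IntegrandOp sums over the dimensional axis, like tf.reduce_sum(xarr, axis=1)
    np.testing.assert_allclose(
        integrand_module.integrand_op(xarr).numpy(),
        tf.reduce_sum(xarr, axis=1).numpy(),
    )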
From 497310f5fe9bcdd88156a3942f0fbd849948853b Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:36:20 +0200
Subject: [PATCH 4/7] at least it compiles

---
 examples/cuda/cuda_example.py  |  2 +-
 examples/cuda/integrand.cpp    | 86 ++++++++++++++++++++++++----------
 examples/cuda/integrand.cu.cpp | 35 ++++++++++++++
 examples/cuda/integrand.h      | 21 +++++++++
 examples/cuda/makefile         | 21 ++++++++-
 5 files changed, 137 insertions(+), 28 deletions(-)
 create mode 100644 examples/cuda/integrand.cu.cpp
 create mode 100644 examples/cuda/integrand.h

diff --git a/examples/cuda/cuda_example.py b/examples/cuda/cuda_example.py
index fad1537..167ff71 100644
--- a/examples/cuda/cuda_example.py
+++ b/examples/cuda/cuda_example.py
@@ -6,7 +6,7 @@
 
 # MC integration setup
 dim = 4
-ncalls = np.int32(1e4)
+ncalls = np.int32(1e6)
 n_iter = 5
 
 integrand_module = tf.load_op_library('./integrand.so')
diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index 09427b9..a0d7056 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -1,6 +1,9 @@
+//#include "cuda_kernel.h"
+
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "integrand.h"
 
 /*
  * In this example we follow the TF guide for operation creation
@@ -13,43 +16,74 @@
 
 using namespace tensorflow;
 
+using GPUDevice = Eigen::GpuDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// CPU
+template <typename T>
+struct IntegrandOpFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice &d, const T *input, T *output, const int nevents, const int dims) {
+    for (int i = 0; i < nevents; i++) {
+      output[i] = 0.0;
+      for(int j = 0; j < dims; j++) {
+        output[i] += input[i * dims + j];
+      }
+    }
+  }
+};
+
+
+/* The input and output type must be coherent with the types used in tensorflow
+ * at this moment we are using float64 as default for vegasflow.
+ *
+ * The output shape is set to be (input_shape[0], ), i.e., number of events
+ */
-REGISTER_OP("IntegrandOp")
-.Input("xarr: double")
-.Output("ret: double")
-.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-    c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
-    return Status::OK();
-  });
+//REGISTER_OP("IntegrandOp")
+//    .Input("xarr: double")
+//    .Output("ret: double")
+//    .SetShapeFn([](shape_inference::InferenceContext* c) {
+//        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+//        return Status::OK();
+//    });
 
-class IntegrandOp: public OpKernel {
+template <typename Device, typename T>
+class IntegrandOp: public OpKernel {
   public:
-    explicit IntegrandOp(OpKernelConstruction* context) : OpKernel(context) {}
+    explicit IntegrandOp(OpKernelConstruction* context): OpKernel(context) {}
 
     void Compute(OpKernelContext* context) override {
-      // Grab the input tensor
-      // the input tensor is expected to have a (nevents, ndim) shape
+      // Grab input tensor, which is expected to be of shape (nevents, ndim)
       const Tensor& input_tensor = context->input(0);
-      auto input = input_tensor.tensor<double, 2>();
+      auto input = input_tensor.tensor<T, 2>().data();
       auto input_shape = input_tensor.shape();
 
-      // Create an output tensor
-      // the expected shape is (nevents,)
-      Tensor* output_tensor = NULL;
+      // Create an output tensor of shape (nevents,)
+      Tensor* output_tensor = nullptr;
       TensorShape output_shape;
       const int N = input_shape.dim_size(0);
+      const int dims = input_shape.dim_size(1);
       output_shape.AddDim(N);
-      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
-                                                       &output_tensor));
+      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor));
 
-      auto output_flat = output_tensor->flat<double>();
+      auto output_flat = output_tensor->flat<T>().data();
 
-      // Sum in the dimensional axis
-      for (int i = 0; i < N; i++) {
-        output_flat(i) = 0.0;
-        for(int j = 0; j < input_shape.dim_size(1); j++) {
-          output_flat(i) += input(i,j);
-        }
-      }
+      // Perform the actual computation
+      IntegrandOpFunctor<Device, T>()(
+          context->eigen_device<Device>(), input, output_flat, N, dims
+      );
     }
 };
 
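+// The registration macros below instantiate the templated kernel for
+// T=double and register it with the TF runtime under the name "IntegrandOp";
+// tf.load_op_library() exposes it to Python in snake_case, as integrand_op.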
-REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp);
+// Register the CPU version of the kernel
+#define REGISTER_CPU(T) \
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), IntegrandOp<CPUDevice, T>);
+REGISTER_CPU(double);
+
+// Register the GPU version
+#ifdef KERNEL_CUDA
+#define REGISTER_GPU(T) \
+  /* Declare explicit instantiations in kernel_example.cu.cc. */ \
+  extern template class ExampleFunctor<GPUDevice, T>; \
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), IntegrandOp<GPUDevice, T>);
+REGISTER_GPU(double);
+#endif
diff --git a/examples/cuda/integrand.cu.cpp b/examples/cuda/integrand.cu.cpp
new file mode 100644
index 0000000..f977503
--- /dev/null
+++ b/examples/cuda/integrand.cu.cpp
@@ -0,0 +1,35 @@
+#if KERNEL_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "integrand.h"
+
+using namespace tensorflow;
+using GPUDevice = Eigen::GpuDevice;
+
+// This is the kernel that does the actual computation on device
+template <typename T>
+__global__ void IntegrandOpKernel(const T *input, T *output, const int nevents, const int ndim) {
+  const auto gid = blockIdx.x*blockDim.x + threadIdx.x;
+  // note: this is an example of usage, not an example of a very optimal anything...
+  for (int i = gid; i < nevents; i += blockDim.x*gridDim.x) {
+    output[i] = 2.0;
+    for (int j = 0; j < ndim; j++) {
+      output[i] += input[i * ndim + j];
+    }
+  }
+}
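+
+// The kernel above uses a grid-stride loop: thread gid covers the events
+// gid, gid + blockDim.x*gridDim.x, gid + 2*blockDim.x*gridDim.x, ...
+// so the launch configuration chosen below does not need to match nevents.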
+
+// But it still needs to be launched from within C++
+// this bit is to be compared with the functor at the top of integrand.cpp
+template <typename T>
+void IntegrandOpFunctor<GPUDevice, T>::operator()(const GPUDevice &d, const T *input, T *output, const int nevents, const int dims) {
+  const int block_count = 1024;
+  const int thread_per_block = 20;
+  IntegrandOpKernel<T><<<block_count, thread_per_block, 0, d.stream()>>>(input, output, nevents, dims);
+}
+
+template struct IntegrandOpFunctor<GPUDevice, double>;
+
+
+#endif
diff --git a/examples/cuda/integrand.h b/examples/cuda/integrand.h
new file mode 100644
index 0000000..026d030
--- /dev/null
+++ b/examples/cuda/integrand.h
@@ -0,0 +1,21 @@
+#ifndef KERNEL_INTEGRAND_
+#define KERNEL_INTEGRAND_
+
+namespace tensorflow {
+  using Eigen::GpuDevice;
+
+  template <typename Device, typename T>
+  struct IntegrandOpFunctor {
+    void operator()(const Device &d, const T *input, T *output, const int nevents, const int dims);
+  };
+
+#if KERNEL_CUDA
+  template <typename T>
+  struct IntegrandOpFunctor<GpuDevice, T> {
+    void operator()(const Eigen::GpuDevice &d, const T *input, T *output, const int nevents, const int dims);
+  };
+#endif
+
+}
+
+#endif
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 2ef2729..8fdb236 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -1,15 +1,34 @@
 target_cpu=integrand.so
 cpu_src=integrand.cpp
 
+target_gpu=integrand.cu.o
+gpu_src=integrand.cu.cpp
+
 TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
 TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
 
 CXX=g++
-CXFLAGS=-std=c++14 -shared -fPIC -O2
+NCC=nvcc
+CXFLAGS=-std=c++11 -shared -fPIC -O2
+NCCFLAGS=-std=c++11 -D KERNEL_CUDA=1 -x cu -Xcompiler -fPIC
+NLFLAGS=-L/opt/cuda/lib64 -lcudart
+
+.PHONY: run_cpu run_gpu clean
 
 run_cpu: $(target_cpu)
 	@python cuda_example.py
 
+run_gpu: $(target_gpu) $(target_cpu)
+	@python cuda_example.py
+
 $(target_cpu): $(cpu_src)
 	@echo "Compiling CPU kernel"
 	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
+
+$(target_gpu): $(gpu_src)
+	@echo "Compiling GPU kernel"
+	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
+	@$(CXX) $(CXFLAGS) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+
+clean:
+	rm -f $(target_gpu) $(target_gpu) integrand.so

From bf48a9f0b7c5afa85ea408c46a60b03324033686 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:38:20 +0200
Subject: [PATCH 5/7] now it runs

---
 examples/cuda/integrand.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index a0d7056..28916f0 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -38,13 +38,13 @@ struct IntegrandOpFunctor {
  *
  * The output shape is set to be (input_shape[0], ), i.e., number of events
  */
-//REGISTER_OP("IntegrandOp")
-//    .Input("xarr: double")
-//    .Output("ret: double")
-//    .SetShapeFn([](shape_inference::InferenceContext* c) {
-//        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
-//        return Status::OK();
-//    });
+REGISTER_OP("IntegrandOp")
+    .Input("xarr: double")
+    .Output("ret: double")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+        return Status::OK();
+    });
 
 template <typename Device, typename T>
 class IntegrandOp: public OpKernel {
@@ -76,7 +76,7 @@ class IntegrandOp: public OpKernel {
 
 // Register the CPU version of the kernel
 #define REGISTER_CPU(T) \
-  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), IntegrandOp<CPUDevice, T>);
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp<CPUDevice, T>);
 REGISTER_CPU(double);
 
 // Register the GPU version
@@ -84,6 +84,6 @@ REGISTER_CPU(double);
 #define REGISTER_GPU(T) \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
   extern template class ExampleFunctor<GPUDevice, T>; \
-  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), IntegrandOp<GPUDevice, T>);
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU), IntegrandOp<GPUDevice, T>);
 REGISTER_GPU(double);
 #endif
From 2315f17b974537009e80cd242a1c77f2cb857186 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:46:46 +0200
Subject: [PATCH 6/7] fix copy and paste ...

---
 examples/cuda/integrand.cpp    | 2 +-
 examples/cuda/integrand.cu.cpp | 2 +-
 examples/cuda/makefile         | 9 +++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index 28916f0..f2e436c 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -83,7 +83,7 @@ REGISTER_CPU(double);
 // Register the GPU version
 #ifdef KERNEL_CUDA
 #define REGISTER_GPU(T) \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template class ExampleFunctor<GPUDevice, T>; \
+  extern template class IntegrandOpFunctor<GPUDevice, T>; \
   REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU), IntegrandOp<GPUDevice, T>);
 REGISTER_GPU(double);
 #endif
diff --git a/examples/cuda/integrand.cu.cpp b/examples/cuda/integrand.cu.cpp
index f977503..d4c61c7 100644
--- a/examples/cuda/integrand.cu.cpp
+++ b/examples/cuda/integrand.cu.cpp
@@ -13,7 +13,7 @@ __global__ void IntegrandOpKernel(const T *input, T *output, const int nevents,
   const auto gid = blockIdx.x*blockDim.x + threadIdx.x;
   // note: this is an example of usage, not an example of a very optimal anything...
   for (int i = gid; i < nevents; i += blockDim.x*gridDim.x) {
-    output[i] = 2.0;
+    output[i] = 0.0;
     for (int j = 0; j < ndim; j++) {
       output[i] += input[i * ndim + j];
     }
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 8fdb236..2d19d2e 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -10,7 +10,8 @@ TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_l
 CXX=g++
 NCC=nvcc
 CXFLAGS=-std=c++11 -shared -fPIC -O2
-NCCFLAGS=-std=c++11 -D KERNEL_CUDA=1 -x cu -Xcompiler -fPIC
+KERNEL=-D KERNEL_CUDA=1
+NCCFLAGS=-std=c++11 $(KERNEL) -x cu -Xcompiler -fPIC
 NLFLAGS=-L/opt/cuda/lib64 -lcudart
 
 .PHONY: run_cpu run_gpu clean
@@ -18,7 +19,7 @@ NLFLAGS=-L/opt/cuda/lib64 -lcudart
 run_cpu: $(target_cpu)
 	@python cuda_example.py
 
-run_gpu: $(target_gpu) $(target_cpu)
+run_gpu: $(target_gpu)
 	@python cuda_example.py
 
 $(target_cpu): $(cpu_src)
@@ -28,7 +29,7 @@ $(target_cpu): $(cpu_src)
 $(target_gpu): $(gpu_src)
 	@echo "Compiling GPU kernel"
 	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
-	@$(CXX) $(CXFLAGS) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+	@$(CXX) $(CXFLAGS) $(KERNEL) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
 
 clean:
-	rm -f $(target_gpu) $(target_gpu) integrand.so
+	rm -f $(target_gpu) $(target_cpu)

From 92ef4c0aa5112814bf35be2d0b11dbfe5b8d3997 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Tue, 28 Jul 2020 09:54:48 +0200
Subject: [PATCH 7/7] clean makefile

---
 examples/cuda/makefile | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 2d19d2e..3afd78e 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -1,35 +1,34 @@
-target_cpu=integrand.so
-cpu_src=integrand.cpp
-
-target_gpu=integrand.cu.o
-gpu_src=integrand.cu.cpp
+target_lib=integrand.so
 
 TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
 TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
 
 CXX=g++
-NCC=nvcc
 CXFLAGS=-std=c++11 -shared -fPIC -O2
-KERNEL=-D KERNEL_CUDA=1
-NCCFLAGS=-std=c++11 $(KERNEL) -x cu -Xcompiler -fPIC
-NLFLAGS=-L/opt/cuda/lib64 -lcudart
+KERNEL_DEF=-D KERNEL_CUDA=1
+NCCFLAGS=-std=c++11 $(KERNEL_DEF) -x cu -Xcompiler -fPIC --disable-warnings
 
-.PHONY: run_cpu run_gpu clean
+# Check whether there's nvcc
+ifeq (,$(shell which nvcc 2>/dev/null))
+else
+	NCC:=nvcc
+	NCCLIB:=$(subst bin/nvcc,lib64, $(shell which nvcc))
+	CXFLAGS+=$(KERNEL_DEF) -L$(NCCLIB) -lcudart
+	kernel_comp=integrand.cu.o
+endif
 
-run_cpu: $(target_cpu)
+.PHONY: run clean
+
+run: $(target_lib)
 	@python cuda_example.py
 
-run_gpu: $(target_gpu)
-	@python cuda_example.py
-
-$(target_cpu): $(cpu_src)
-	@echo "Compiling CPU kernel"
-	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
-
-$(target_gpu): $(gpu_src)
-	@echo "Compiling GPU kernel"
-	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
-	@$(CXX) $(CXFLAGS) $(KERNEL) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+%.cu.o: %.cu.cpp
+	@echo "[$(NCC)] Integrating cuda kernel..."
+	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
+
+%.so: %.cpp $(kernel_comp)
+	@echo "[$(CXX)] Integrating operator..."
+	@$(CXX) $(CXFLAGS) -o $@ $^ $(TF_CFLAGS) $(TF_LFLAGS)
 
 clean:
-	rm -f $(target_gpu) $(target_cpu)
+	rm -f $(target_lib) $(kernel_comp)
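
Once `make run` succeeds, the compiled operator is not tied to the plain MC of cuda_example.py: any vegasflow integrator accepts it, since it is just a function from a rank-2 tensor of random points to a rank-1 tensor of weights. A minimal sketch with the importance-sampling wrapper from vegasflow.vflow, assuming ./integrand.so has been built as above:

    import tensorflow as tf
    from vegasflow.vflow import vegas_wrapper

    integrand_module = tf.load_op_library("./integrand.so")

    def wrapper_integrand(xarr, **kwargs):
        # the compiled kernel replaces the python integrand body
        return integrand_module.integrand_op(xarr)

    dim, n_iter, ncalls = 4, 5, int(1e6)
    r = vegas_wrapper(wrapper_integrand, dim, n_iter, ncalls)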