diff --git a/examples/cuda/cuda_example.py b/examples/cuda/cuda_example.py new file mode 100644 index 0000000..167ff71 --- /dev/null +++ b/examples/cuda/cuda_example.py @@ -0,0 +1,28 @@ +from vegasflow.configflow import DTYPE, DTYPEINT +import time +import numpy as np +import tensorflow as tf +from vegasflow.plain import plain_wrapper + +# MC integration setup +dim = 4 +ncalls = np.int32(1e6) +n_iter = 5 + +integrand_module = tf.load_op_library('./integrand.so') + +@tf.function +def wrapper_integrand(xarr, **kwargs): + return integrand_module.integrand_op(xarr) + +@tf.function +def fully_python_integrand(xarr, **kwargs): + return tf.reduce_sum(xarr, axis=1) + +if __name__ == "__main__": + print(f"VEGAS MC, ncalls={ncalls}:") + start = time.time() + ncalls = 10*ncalls + r = plain_wrapper(wrapper_integrand, dim, n_iter, ncalls) + end = time.time() + print(f"Vegas took: time (s): {end-start}") diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp new file mode 100644 index 0000000..f2e436c --- /dev/null +++ b/examples/cuda/integrand.cpp @@ -0,0 +1,89 @@ +//#include "cuda_kernel.h" + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "integrand.h" + +/* + * In this example we follow the TF guide for operation creation + * https://www.tensorflow.org/guide/create_op + * to create an integrand as a custom operators. + * + * To first approximation, these operators are function that take + * a tensor and return a tensor. + */ + +using namespace tensorflow; + +using GPUDevice = Eigen::GpuDevice; +using CPUDevice = Eigen::ThreadPoolDevice; + +// CPU +template +struct IntegrandOpFunctor { + void operator()(const CPUDevice &d, const T *input, T *output, const int nevents, const int dims) { + for (int i = 0; i < nevents; i++) { + output[i] = 0.0; + for(int j = 0; j < dims; j++) { + output[i] += input[i,j]; + } + } + } +}; + + +/* The input and output type must be coherent with the types used in tensorflow + * at this moment we are using float64 as default for vegasflow. + * + * The output shape is set to be (input_shape[0], ), i.e., number of events + */ +REGISTER_OP("IntegrandOp") + .Input("xarr: double") + .Output("ret: double") + .SetShapeFn([](shape_inference::InferenceContext* c) { + c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) ); + return Status::OK(); + }); + +template +class IntegrandOp: public OpKernel { + public: + explicit IntegrandOp(OpKernelConstruction* context): OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // Grab input tensor, which is expected to be of shape (nevents, ndim) + const Tensor& input_tensor = context->input(0); + auto input = input_tensor.tensor().data(); + auto input_shape = input_tensor.shape(); + + // Create an output tensor of shape (nevents,) + Tensor* output_tensor = nullptr; + TensorShape output_shape; + const int N = input_shape.dim_size(0); + const int dims = input_shape.dim_size(1); + output_shape.AddDim(N); + OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor)); + + auto output_flat = output_tensor->flat().data(); + + // Perform the actual computation + IntegrandOpFunctor()( + context->eigen_device(), input, output_flat, N, dims + ); + } +}; + +// Register the CPU version of the kernel +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp); +REGISTER_CPU(double); + +// Register the GPU version +#ifdef KERNEL_CUDA +#define REGISTER_GPU(T) \ + /* Declare explicit instantiations in kernel_example.cu.cc. */ \ + extern template class IntegrandOpFunctor; \ + REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU),IntegrandOp); +REGISTER_GPU(double); +#endif diff --git a/examples/cuda/integrand.cu.cpp b/examples/cuda/integrand.cu.cpp new file mode 100644 index 0000000..d4c61c7 --- /dev/null +++ b/examples/cuda/integrand.cu.cpp @@ -0,0 +1,35 @@ +#if KERNEL_CUDA +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/op_kernel.h" +#include "integrand.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; + +// This is the kernel that does the actual computation on device +template +__global__ void IntegrandOpKernel(const T *input, T *output, const int nevents, const int ndim) { + const auto gid = blockIdx.x*blockDim.x + threadIdx.x; + // note: this an example of usage, not an example of a very optimal anything... + for (int i = gid; i < nevents; i += blockDim.x*gridDim.x) { + output[i] = 0.0; + for (int j = 0; j < ndim; j++) { + output[i] += input[i,j]; + } + } +} + +// But it still needs to be launched from within C++ +// this bit is to be compared with the functor at the top of integrand.cpp +template +void IntegrandOpFunctor::operator()(const GPUDevice &d, const T *input, T *output, const int nevents, const int dims) { + const int block_count = 1024; + const int thread_per_block = 20; + IntegrandOpKernel<<>>(input, output, nevents, dims); +} + +template struct IntegrandOpFunctor; + + +#endif diff --git a/examples/cuda/integrand.h b/examples/cuda/integrand.h new file mode 100644 index 0000000..026d030 --- /dev/null +++ b/examples/cuda/integrand.h @@ -0,0 +1,21 @@ +#ifndef KERNEL_INTEGRAND_ +#define KERNEL_INTEGRAND_ + +namespace tensorflow { + using Eigen::GpuDevice; + + template + struct IntegrandOpFunctor { + void operator()(const Device &d, const T *input, T *output, const int nevents, const int dims); + }; + +#if KERNEL_CUDA + template + struct IntegrandOpFunctor { + void operator()(const Eigen::GpuDevice &d, const T *input, T *output, const int nevents, const int dims); + }; +#endif + +} + +#endif diff --git a/examples/cuda/makefile b/examples/cuda/makefile new file mode 100644 index 0000000..3afd78e --- /dev/null +++ b/examples/cuda/makefile @@ -0,0 +1,34 @@ +target_lib=integrand.so + +TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null` +TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null` + +CXX=g++ +CXFLAGS=-std=c++11 -shared -fPIC -O2 +KERNEL_DEF=-D KERNEL_CUDA=1 +NCCFLAGS=-std=c++11 $(KERNEL_DEF) -x cu -Xcompiler -fPIC --disable-warnings + +# Check whether there's nvcc +ifeq (,$(shell which nvcc 2>/dev/null)) +else + NCC:=nvcc + NCCLIB:=$(subst bin/nvcc,lib64, $(shell which nvcc)) + CXFLAGS+=$(KERNEL_DEF) -L$(NCCLIB) -lcudart + kernel_comp=integrand.cu.o +endif + +.PHONY: run clean + +run: $(target_lib) + @python cuda_example.py + +%.cu.o: %.cu.cpp + @echo "[$(NCC)] Integrating cuda kernel..." + @$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS) + +%.so: %.cpp $(kernel_comp) + @echo "[$(CXX)] Integrating operator..." + @$(CXX) $(CXFLAGS) $(KERNEL) -o $@ $^ $(TF_CFLAGS) $(TF_LFLAGS) + +clean: + rm -f $(target_lib) $(kernel_comp) diff --git a/examples/simgauss_tf.py b/examples/simgauss_tf.py index 333262d..a6d8ed9 100644 --- a/examples/simgauss_tf.py +++ b/examples/simgauss_tf.py @@ -1,8 +1,8 @@ # Place your function here +from vegasflow.configflow import DTYPE, DTYPEINT import time import numpy as np import tensorflow as tf -from vegasflow.configflow import DTYPE, DTYPEINT from vegasflow.vflow import vegas_wrapper from vegasflow.plain import plain_wrapper diff --git a/src/vegasflow/__init__.py b/src/vegasflow/__init__.py index 40784c8..7b91e6c 100644 --- a/src/vegasflow/__init__.py +++ b/src/vegasflow/__init__.py @@ -1,3 +1,3 @@ """Monte Carlo integration with Tensorflow""" -__version__ = '1.0.2' +__version__ = "1.0.2" diff --git a/src/vegasflow/configflow.py b/src/vegasflow/configflow.py index a390e7c..89cbdca 100644 --- a/src/vegasflow/configflow.py +++ b/src/vegasflow/configflow.py @@ -1,9 +1,29 @@ """ Define some constants, header style """ +import os + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Most of this can be moved to a yaml file without loss of generality import tensorflow as tf +# uncomment this line for debugging to avoid compiling any tf.function +# tf.config.experimental_run_functions_eagerly(True) + +# Configure logging +import logging + +module_name = __name__.split(".")[0] +logger = logging.getLogger(module_name) +# Set level debug for development +logger.setLevel(logging.DEBUG) +# Create a handler and format it +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.DEBUG) +console_format = logging.Formatter("[%(levelname)s] %(message)s") +console_handler.setFormatter(console_format) +logger.addHandler(console_handler) + # Define the tf.numberic types DTYPE = tf.float64 DTYPEINT = tf.int32 @@ -15,7 +35,7 @@ # Set up the logistics of the integration # Events Limit limits how many events are done in one single run of the event_loop # set it lower if hitting memory problems -MAX_EVENTS_LIMIT = int(1e7) +MAX_EVENTS_LIMIT = int(1e6) # Select the list of devices to look for DEFAULT_ACTIVE_DEVICES = ["GPU"] # , 'CPU'] diff --git a/src/vegasflow/monte_carlo.py b/src/vegasflow/monte_carlo.py index 7f3968a..046cb15 100644 --- a/src/vegasflow/monte_carlo.py +++ b/src/vegasflow/monte_carlo.py @@ -41,6 +41,10 @@ import tensorflow as tf from vegasflow.configflow import MAX_EVENTS_LIMIT, DEFAULT_ACTIVE_DEVICES, DTYPE +import logging + +logger = logging.getLogger(__name__) + def print_iteration(it, res, error, extra="", threshold=0.1): """ Checks the size of the result to select between @@ -48,9 +52,10 @@ def print_iteration(it, res, error, extra="", threshold=0.1): # note: actually, the flag 'g' does this automatically # but I prefer to choose the precision myself... if res < threshold: - print(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra) + logger.info(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra) else: - print(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra) + logger.info(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra) + def _accumulate(accumulators): """ Accumulate all the quantities in accumulators @@ -135,8 +140,8 @@ def events_per_run(self, val): """ Set the number of events per single step """ self._events_per_run = min(val, self.n_events) if self.n_events % self._events_per_run != 0: - print( - f"Warning, the number of events per run step {self._events_per_run} doesn't perfectly" + logger.warning( + f"The number of events per run step {self._events_per_run} doesn't perfectly" f"divide the number of events {self.n_events}, which can harm performance" ) @@ -379,7 +384,7 @@ def run_integration(self, n_iter, log_time=True, histograms=None): final_result = aux_res / weight_sum sigma = np.sqrt(1.0 / weight_sum) - print(f" > Final results: {final_result.numpy():g} +/- {sigma:g}") + logger.info(f" > Final results: {final_result.numpy():g} +/- {sigma:g}") return final_result, sigma diff --git a/src/vegasflow/plain.py b/src/vegasflow/plain.py index 4edb249..4bfda36 100644 --- a/src/vegasflow/plain.py +++ b/src/vegasflow/plain.py @@ -21,9 +21,7 @@ def _run_event(self, integrand, ncalls=None): # Jacobian xjac = 1.0 / self.n_events # Generate all random number for this iteration - rnds = tf.random.uniform( - (n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE - ) + rnds = tf.random.uniform((n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE) # Compute the integrand tmp = integrand(rnds, n_dim=self.n_dim, weight=xjac) * xjac tmp2 = tf.square(tmp) diff --git a/src/vegasflow/vflow.py b/src/vegasflow/vflow.py index 1b14572..773e937 100644 --- a/src/vegasflow/vflow.py +++ b/src/vegasflow/vflow.py @@ -14,6 +14,10 @@ from vegasflow.monte_carlo import MonteCarloFlow, wrapper from vegasflow.utils import consume_array_into_indices +import logging + +logger = logging.getLogger(__name__) + FBINS = float_me(BINS_MAX) # Auxiliary functions for Vegas @@ -135,10 +139,7 @@ def while_body(bin_weight, n_bin, cur, prev): prev = fzero for _ in range(BINS_MAX - 1): bin_weight, n_bin, cur, prev = tf.while_loop( - while_check, - while_body, - (bin_weight, n_bin, cur, prev), - parallel_iterations=1, + while_check, while_body, (bin_weight, n_bin, cur, prev), parallel_iterations=1, ) bin_weight -= ave_t delta = (cur - prev) * bin_weight / wei_t[n_bin] @@ -232,8 +233,8 @@ def load_grid(self, file_name=None, numpy_grid=None): integrand_name = self.integrand.__name__ integrand_grid = json_dict.get("integrand") if integrand_name != integrand_grid: - print( - f"WARNING: The grid was written for the integrand: {integrand_grid}" + logger.warning( + f"The grid was written for the integrand: {integrand_grid}" f"which is different from {integrand_name}" ) # Now that everything is clear, let's load up the grid @@ -255,7 +256,7 @@ def load_grid(self, file_name=None, numpy_grid=None): f"current settings is of {self.grid_bins} bins" ) if file_name: - print(f" > SUCCESS: Loaded grid from {file_name}") + logger.info(f" > SUCCESS: Loaded grid from {file_name}") self.divisions.assign(numpy_grid) def refine_grid(self, arr_res2): @@ -270,9 +271,7 @@ def refine_grid(self, arr_res2): Function not compiled """ for j in range(self.n_dim): - new_divisions = refine_grid_per_dimension( - arr_res2[j, :], self.divisions[j, :] - ) + new_divisions = refine_grid_per_dimension(arr_res2[j, :], self.divisions[j, :]) self.divisions[j, :].assign(new_divisions) def _run_event(self, integrand, ncalls=None): @@ -317,9 +316,7 @@ def _run_event(self, integrand, ncalls=None): # If the training is active, save the result of the integral sq for j in range(self.n_dim): arr_res2.append( - consume_array_into_indices( - tmp2, ind[:, j : j + 1], self.grid_bins - 1 - ) + consume_array_into_indices(tmp2, ind[:, j : j + 1], self.grid_bins - 1) ) arr_res2 = tf.reshape(arr_res2, (self.n_dim, -1))