From 298ef0d69d8d8a226834833a283a6d9e84a75c23 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 27 Jul 2020 14:05:15 +0200 Subject: [PATCH 1/7] use logging --- examples/simgauss_tf.py | 2 +- src/vegasflow/configflow.py | 20 +++++++++++++++++++- src/vegasflow/monte_carlo.py | 13 ++++++++----- src/vegasflow/vflow.py | 9 ++++++--- 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/examples/simgauss_tf.py b/examples/simgauss_tf.py index 333262d..a6d8ed9 100644 --- a/examples/simgauss_tf.py +++ b/examples/simgauss_tf.py @@ -1,8 +1,8 @@ # Place your function here +from vegasflow.configflow import DTYPE, DTYPEINT import time import numpy as np import tensorflow as tf -from vegasflow.configflow import DTYPE, DTYPEINT from vegasflow.vflow import vegas_wrapper from vegasflow.plain import plain_wrapper diff --git a/src/vegasflow/configflow.py b/src/vegasflow/configflow.py index a390e7c..1684069 100644 --- a/src/vegasflow/configflow.py +++ b/src/vegasflow/configflow.py @@ -1,8 +1,26 @@ """ Define some constants, header style """ +import os +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1" # Most of this can be moved to a yaml file without loss of generality import tensorflow as tf +# uncomment this line for debugging to avoid compiling any tf.function +# tf.config.experimental_run_functions_eagerly(True) + +# Configure logging +import logging + +module_name = __name__.split(".")[0] +logger = logging.getLogger(module_name) +# Set level debug for development +logger.setLevel(logging.DEBUG) +# Create a handler and format it +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.DEBUG) +console_format = logging.Formatter("[%(levelname)s] %(message)s") +console_handler.setFormatter(console_format) +logger.addHandler(console_handler) # Define the tf.numberic types DTYPE = tf.float64 @@ -15,7 +33,7 @@ # Set up the logistics of the integration # Events Limit limits how many events are done in one single run of the event_loop # set it lower if hitting memory problems -MAX_EVENTS_LIMIT = int(1e7) +MAX_EVENTS_LIMIT = int(1e6) # Select the list of devices to look for DEFAULT_ACTIVE_DEVICES = ["GPU"] # , 'CPU'] diff --git a/src/vegasflow/monte_carlo.py b/src/vegasflow/monte_carlo.py index 7f3968a..a2b5722 100644 --- a/src/vegasflow/monte_carlo.py +++ b/src/vegasflow/monte_carlo.py @@ -41,6 +41,9 @@ import tensorflow as tf from vegasflow.configflow import MAX_EVENTS_LIMIT, DEFAULT_ACTIVE_DEVICES, DTYPE +import logging +logger = logging.getLogger(__name__) + def print_iteration(it, res, error, extra="", threshold=0.1): """ Checks the size of the result to select between @@ -48,9 +51,9 @@ def print_iteration(it, res, error, extra="", threshold=0.1): # note: actually, the flag 'g' does this automatically # but I prefer to choose the precision myself... 
if res < threshold:
-        print(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra)
+        logger.info(f"Result for iteration {it}: {res:.3e} +/- {error:.3e}" + extra)
     else:
-        print(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra)
+        logger.info(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra)
 
 
 def _accumulate(accumulators):
     """ Accumulate all the quantities in accumulators
@@ -135,8 +138,8 @@ def events_per_run(self, val):
         """ Set the number of events per single step """
         self._events_per_run = min(val, self.n_events)
         if self.n_events % self._events_per_run != 0:
-            print(
-                f"Warning, the number of events per run step {self._events_per_run} doesn't perfectly"
+            logger.warning(
+                f"The number of events per run step {self._events_per_run} doesn't perfectly "
                 f"divide the number of events {self.n_events}, which can harm performance"
             )
 
@@ -379,7 +382,7 @@ def run_integration(self, n_iter, log_time=True, histograms=None):
         final_result = aux_res / weight_sum
         sigma = np.sqrt(1.0 / weight_sum)
 
-        print(f" > Final results: {final_result.numpy():g} +/- {sigma:g}")
+        logger.info(f" > Final result: {final_result.numpy():g} +/- {sigma:g}")
         return final_result, sigma
 
diff --git a/src/vegasflow/vflow.py b/src/vegasflow/vflow.py
index 1b14572..9c9d758 100644
--- a/src/vegasflow/vflow.py
+++ b/src/vegasflow/vflow.py
@@ -14,6 +14,9 @@
 from vegasflow.monte_carlo import MonteCarloFlow, wrapper
 from vegasflow.utils import consume_array_into_indices
 
+import logging
+logger = logging.getLogger(__name__)
+
 FBINS = float_me(BINS_MAX)
 
 # Auxiliary functions for Vegas
@@ -232,8 +235,8 @@ def load_grid(self, file_name=None, numpy_grid=None):
             integrand_name = self.integrand.__name__
             integrand_grid = json_dict.get("integrand")
             if integrand_name != integrand_grid:
-                print(
-                    f"WARNING: The grid was written for the integrand: {integrand_grid}"
+                logger.warning(
+                    f"The grid was written for the integrand: {integrand_grid} "
                     f"which is different from {integrand_name}"
                 )
         # Now that everything is clear, let's load up the grid
@@ -255,7 +258,7 @@
                     f"current settings is of {self.grid_bins} bins"
                 )
         if file_name:
-            print(f" > SUCCESS: Loaded grid from {file_name}")
+            logger.info(f" > SUCCESS: Loaded grid from {file_name}")
         self.divisions.assign(numpy_grid)
 
     def refine_grid(self, arr_res2):

From 88af86e386f5809904255109d2210053e3ef646a Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 14:05:26 +0200
Subject: [PATCH 2/7] reformatting

---
 src/vegasflow/__init__.py    |  2 +-
 src/vegasflow/configflow.py  |  2 ++
 src/vegasflow/monte_carlo.py |  2 ++
 src/vegasflow/plain.py       |  4 +---
 src/vegasflow/vflow.py       | 14 ++++----------
 5 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/src/vegasflow/__init__.py b/src/vegasflow/__init__.py
index 40784c8..7b91e6c 100644
--- a/src/vegasflow/__init__.py
+++ b/src/vegasflow/__init__.py
@@ -1,3 +1,3 @@
 """Monte Carlo integration with Tensorflow"""
 
-__version__ = '1.0.2'
+__version__ = "1.0.2"
diff --git a/src/vegasflow/configflow.py b/src/vegasflow/configflow.py
index 1684069..89cbdca 100644
--- a/src/vegasflow/configflow.py
+++ b/src/vegasflow/configflow.py
@@ -2,9 +2,11 @@
 Define some constants, header style
 """
 import os
+
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
 # Most of this can be moved to a yaml file without loss of generality
 import tensorflow as tf
+
 # uncomment this line for debugging to avoid compiling any tf.function
 # tf.config.experimental_run_functions_eagerly(True)
 
diff --git
a/src/vegasflow/monte_carlo.py b/src/vegasflow/monte_carlo.py index a2b5722..046cb15 100644 --- a/src/vegasflow/monte_carlo.py +++ b/src/vegasflow/monte_carlo.py @@ -42,6 +42,7 @@ from vegasflow.configflow import MAX_EVENTS_LIMIT, DEFAULT_ACTIVE_DEVICES, DTYPE import logging + logger = logging.getLogger(__name__) @@ -55,6 +56,7 @@ def print_iteration(it, res, error, extra="", threshold=0.1): else: logger.info(f"Result for iteration {it}: {res:.4f} +/- {error:.4f}" + extra) + def _accumulate(accumulators): """ Accumulate all the quantities in accumulators The default accumulation is implemented for tensorflow tensors diff --git a/src/vegasflow/plain.py b/src/vegasflow/plain.py index 4edb249..4bfda36 100644 --- a/src/vegasflow/plain.py +++ b/src/vegasflow/plain.py @@ -21,9 +21,7 @@ def _run_event(self, integrand, ncalls=None): # Jacobian xjac = 1.0 / self.n_events # Generate all random number for this iteration - rnds = tf.random.uniform( - (n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE - ) + rnds = tf.random.uniform((n_events, self.n_dim), minval=0, maxval=1, dtype=DTYPE) # Compute the integrand tmp = integrand(rnds, n_dim=self.n_dim, weight=xjac) * xjac tmp2 = tf.square(tmp) diff --git a/src/vegasflow/vflow.py b/src/vegasflow/vflow.py index 9c9d758..773e937 100644 --- a/src/vegasflow/vflow.py +++ b/src/vegasflow/vflow.py @@ -15,6 +15,7 @@ from vegasflow.utils import consume_array_into_indices import logging + logger = logging.getLogger(__name__) FBINS = float_me(BINS_MAX) @@ -138,10 +139,7 @@ def while_body(bin_weight, n_bin, cur, prev): prev = fzero for _ in range(BINS_MAX - 1): bin_weight, n_bin, cur, prev = tf.while_loop( - while_check, - while_body, - (bin_weight, n_bin, cur, prev), - parallel_iterations=1, + while_check, while_body, (bin_weight, n_bin, cur, prev), parallel_iterations=1, ) bin_weight -= ave_t delta = (cur - prev) * bin_weight / wei_t[n_bin] @@ -273,9 +271,7 @@ def refine_grid(self, arr_res2): Function not compiled """ for j in range(self.n_dim): - new_divisions = refine_grid_per_dimension( - arr_res2[j, :], self.divisions[j, :] - ) + new_divisions = refine_grid_per_dimension(arr_res2[j, :], self.divisions[j, :]) self.divisions[j, :].assign(new_divisions) def _run_event(self, integrand, ncalls=None): @@ -320,9 +316,7 @@ def _run_event(self, integrand, ncalls=None): # If the training is active, save the result of the integral sq for j in range(self.n_dim): arr_res2.append( - consume_array_into_indices( - tmp2, ind[:, j : j + 1], self.grid_bins - 1 - ) + consume_array_into_indices(tmp2, ind[:, j : j + 1], self.grid_bins - 1) ) arr_res2 = tf.reshape(arr_res2, (self.n_dim, -1)) From 27c43f9ddf2d6fa7a76c20e09fcbd51bf8d7d98e Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 27 Jul 2020 16:28:00 +0200 Subject: [PATCH 3/7] add the c++ integrand --- examples/cuda/cuda_example.py | 28 ++++++++++++++++++ examples/cuda/integrand.cpp | 55 +++++++++++++++++++++++++++++++++++ examples/cuda/makefile | 15 ++++++++++ 3 files changed, 98 insertions(+) create mode 100644 examples/cuda/cuda_example.py create mode 100644 examples/cuda/integrand.cpp create mode 100644 examples/cuda/makefile diff --git a/examples/cuda/cuda_example.py b/examples/cuda/cuda_example.py new file mode 100644 index 0000000..fad1537 --- /dev/null +++ b/examples/cuda/cuda_example.py @@ -0,0 +1,28 @@ +from vegasflow.configflow import DTYPE, DTYPEINT +import time +import numpy as np +import tensorflow as tf +from vegasflow.plain import plain_wrapper + +# MC integration setup +dim = 4 +ncalls = np.int32(1e4) 
+n_iter = 5
+
+integrand_module = tf.load_op_library('./integrand.so')
+
+@tf.function
+def wrapper_integrand(xarr, **kwargs):
+    return integrand_module.integrand_op(xarr)
+
+@tf.function
+def fully_python_integrand(xarr, **kwargs):
+    return tf.reduce_sum(xarr, axis=1)
+
+if __name__ == "__main__":
+    print(f"Plain MC, ncalls={ncalls}:")
+    start = time.time()
+    ncalls = 10*ncalls
+    r = plain_wrapper(wrapper_integrand, dim, n_iter, ncalls)
+    end = time.time()
+    print(f"Plain MC took: time (s): {end-start}")
diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
new file mode 100644
index 0000000..09427b9
--- /dev/null
+++ b/examples/cuda/integrand.cpp
@@ -0,0 +1,55 @@
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+/*
+ * In this example we follow the TF guide for operation creation
+ * https://www.tensorflow.org/guide/create_op
+ * to create an integrand as a custom operator.
+ *
+ * To first approximation, these operators are functions that take
+ * a tensor and return a tensor.
+ */
+
+using namespace tensorflow;
+
+REGISTER_OP("IntegrandOp")
+.Input("xarr: double")
+.Output("ret: double")
+.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
+    c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+    return Status::OK();
+  });
+
+class IntegrandOp: public OpKernel {
+  public:
+    explicit IntegrandOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+    void Compute(OpKernelContext* context) override {
+      // Grab the input tensor
+      // the input tensor is expected to have a (nevents, ndim) shape
+      const Tensor& input_tensor = context->input(0);
+      auto input = input_tensor.tensor<double, 2>();
+      auto input_shape = input_tensor.shape();
+
+      // Create an output tensor
+      // the expected shape is (nevents,)
+      Tensor* output_tensor = NULL;
+      TensorShape output_shape;
+      const int N = input_shape.dim_size(0);
+      output_shape.AddDim(N);
+      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
+                                                       &output_tensor));
+      auto output_flat = output_tensor->flat<double>();
+
+      // Sum in the dimensional axis
+      for (int i = 0; i < N; i++) {
+        output_flat(i) = 0.0;
+        for(int j = 0; j < input_shape.dim_size(1); j++) {
+          output_flat(i) += input(i,j);
+        }
+      }
+    }
+};
+
+REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp);
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
new file mode 100644
index 0000000..2ef2729
--- /dev/null
+++ b/examples/cuda/makefile
@@ -0,0 +1,15 @@
+target_cpu=integrand.so
+cpu_src=integrand.cpp
+
+TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
+TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
+
+CXX=g++
+CXFLAGS=-std=c++14 -shared -fPIC -O2
+
+run_cpu: $(target_cpu)
+	@python cuda_example.py
+
+$(target_cpu): $(cpu_src)
+	@echo "Compiling CPU kernel"
+	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
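
A quick way to check the freshly compiled operator against the pure-TF reference it mirrors — a minimal sketch, assuming `make run_cpu` above has already produced ./integrand.so in the working directory:

    import numpy as np
    import tensorflow as tf

    integrand_module = tf.load_op_library("./integrand.so")

    # random points with the (nevents, ndim) shape the op expects
    xarr = tf.random.uniform((100, 4), dtype=tf.float64)
    # IntegrandOp sums over the dimensional axis, like tf.reduce_sum(xarr, axis=1)
    np.testing.assert_allclose(
        integrand_module.integrand_op(xarr).numpy(),
        tf.reduce_sum(xarr, axis=1).numpy(),
    )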
From 497310f5fe9bcdd88156a3942f0fbd849948853b Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:36:20 +0200
Subject: [PATCH 4/7] at least it compiles

---
 examples/cuda/cuda_example.py  |  2 +-
 examples/cuda/integrand.cpp    | 86 ++++++++++++++++++++++++----------
 examples/cuda/integrand.cu.cpp | 35 ++++++++++++++
 examples/cuda/integrand.h      | 21 +++++++++
 examples/cuda/makefile         | 21 ++++++++-
 5 files changed, 137 insertions(+), 28 deletions(-)
 create mode 100644 examples/cuda/integrand.cu.cpp
 create mode 100644 examples/cuda/integrand.h

diff --git a/examples/cuda/cuda_example.py b/examples/cuda/cuda_example.py
index fad1537..167ff71 100644
--- a/examples/cuda/cuda_example.py
+++ b/examples/cuda/cuda_example.py
@@ -6,7 +6,7 @@
 
 # MC integration setup
 dim = 4
-ncalls = np.int32(1e4)
+ncalls = np.int32(1e6)
 n_iter = 5
 
 integrand_module = tf.load_op_library('./integrand.so')
diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index 09427b9..a0d7056 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -1,6 +1,9 @@
+//#include "cuda_kernel.h"
+
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "integrand.h"
 
 /*
  * In this example we follow the TF guide for operation creation
@@ -13,43 +16,74 @@
 
 using namespace tensorflow;
 
+using GPUDevice = Eigen::GpuDevice;
+using CPUDevice = Eigen::ThreadPoolDevice;
+
+// CPU
+template <typename T>
+struct IntegrandOpFunctor<CPUDevice, T> {
+  void operator()(const CPUDevice &d, const T *input, T *output, const int nevents, const int dims) {
+    for (int i = 0; i < nevents; i++) {
+      output[i] = 0.0;
+      for(int j = 0; j < dims; j++) {
+        output[i] += input[i * dims + j];
+      }
+    }
+  }
+};
+
+
+/* The input and output type must be coherent with the types used in tensorflow
+ * at this moment we are using float64 as default for vegasflow.
+ *
+ * The output shape is set to be (input_shape[0], ), i.e., number of events
+ */
-REGISTER_OP("IntegrandOp")
-.Input("xarr: double")
-.Output("ret: double")
-.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
-    c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
-    return Status::OK();
-  });
+//REGISTER_OP("IntegrandOp")
+//    .Input("xarr: double")
+//    .Output("ret: double")
+//    .SetShapeFn([](shape_inference::InferenceContext* c) {
+//        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+//        return Status::OK();
+//    });
 
-class IntegrandOp: public OpKernel {
+template <typename Device, typename T>
+class IntegrandOp: public OpKernel {
   public:
-    explicit IntegrandOp(OpKernelConstruction* context) : OpKernel(context) {}
+    explicit IntegrandOp(OpKernelConstruction* context): OpKernel(context) {}
 
     void Compute(OpKernelContext* context) override {
-      // Grab the input tensor
-      // the input tensor is expected to have a (nevents, ndim) shape
+      // Grab input tensor, which is expected to be of shape (nevents, ndim)
       const Tensor& input_tensor = context->input(0);
-      auto input = input_tensor.tensor<double, 2>();
+      auto input = input_tensor.tensor<T, 2>().data();
       auto input_shape = input_tensor.shape();
 
-      // Create an output tensor
-      // the expected shape is (nevents,)
-      Tensor* output_tensor = NULL;
+      // Create an output tensor of shape (nevents,)
+      Tensor* output_tensor = nullptr;
       TensorShape output_shape;
       const int N = input_shape.dim_size(0);
+      const int dims = input_shape.dim_size(1);
       output_shape.AddDim(N);
-      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape,
-                                                       &output_tensor));
+      OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output_tensor));
 
-      auto output_flat = output_tensor->flat<double>();
+      auto output_flat = output_tensor->flat<T>().data();
 
-      // Sum in the dimensional axis
-      for (int i = 0; i < N; i++) {
-        output_flat(i) = 0.0;
-        for(int j = 0; j < input_shape.dim_size(1); j++) {
-          output_flat(i) += input(i,j);
-        }
-      }
+      // Perform the actual computation
+      IntegrandOpFunctor<Device, T>()(
+          context->eigen_device<Device>(), input, output_flat, N, dims
+      );
     }
 };
 
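+// The registration macros below instantiate the templated kernel for
+// T=double and register it with the TF runtime under the name "IntegrandOp";
+// tf.load_op_library() exposes it to Python in snake_case, as integrand_op.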
-REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp);
+// Register the CPU version of the kernel
+#define REGISTER_CPU(T) \
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), IntegrandOp<CPUDevice, T>);
+REGISTER_CPU(double);
+
+// Register the GPU version
+#ifdef KERNEL_CUDA
+#define REGISTER_GPU(T) \
+  /* Declare explicit instantiations in kernel_example.cu.cc. */ \
+  extern template class ExampleFunctor<GPUDevice, T>; \
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), IntegrandOp<GPUDevice, T>);
+REGISTER_GPU(double);
+#endif
diff --git a/examples/cuda/integrand.cu.cpp b/examples/cuda/integrand.cu.cpp
new file mode 100644
index 0000000..f977503
--- /dev/null
+++ b/examples/cuda/integrand.cu.cpp
@@ -0,0 +1,35 @@
+#if KERNEL_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/framework/op_kernel.h"
+#include "integrand.h"
+
+using namespace tensorflow;
+using GPUDevice = Eigen::GpuDevice;
+
+// This is the kernel that does the actual computation on device
+template <typename T>
+__global__ void IntegrandOpKernel(const T *input, T *output, const int nevents, const int ndim) {
+  const auto gid = blockIdx.x*blockDim.x + threadIdx.x;
+  // note: this is an example of usage, not an example of a very optimal anything...
+  for (int i = gid; i < nevents; i += blockDim.x*gridDim.x) {
+    output[i] = 2.0;
+    for (int j = 0; j < ndim; j++) {
+      output[i] += input[i * ndim + j];
+    }
+  }
+}
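+
+// The kernel above uses a grid-stride loop: thread gid covers the events
+// gid, gid + blockDim.x*gridDim.x, gid + 2*blockDim.x*gridDim.x, ...
+// so the launch configuration chosen below does not need to match nevents.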
+
+// But it still needs to be launched from within C++
+// this bit is to be compared with the functor at the top of integrand.cpp
+template <typename T>
+void IntegrandOpFunctor<GPUDevice, T>::operator()(const GPUDevice &d, const T *input, T *output, const int nevents, const int dims) {
+  const int block_count = 1024;
+  const int thread_per_block = 20;
+  IntegrandOpKernel<T><<<block_count, thread_per_block, 0, d.stream()>>>(input, output, nevents, dims);
+}
+
+template struct IntegrandOpFunctor<GPUDevice, double>;
+
+
+#endif
diff --git a/examples/cuda/integrand.h b/examples/cuda/integrand.h
new file mode 100644
index 0000000..026d030
--- /dev/null
+++ b/examples/cuda/integrand.h
@@ -0,0 +1,21 @@
+#ifndef KERNEL_INTEGRAND_
+#define KERNEL_INTEGRAND_
+
+namespace tensorflow {
+  using Eigen::GpuDevice;
+
+  template <typename Device, typename T>
+  struct IntegrandOpFunctor {
+    void operator()(const Device &d, const T *input, T *output, const int nevents, const int dims);
+  };
+
+#if KERNEL_CUDA
+  template <typename T>
+  struct IntegrandOpFunctor<GpuDevice, T> {
+    void operator()(const Eigen::GpuDevice &d, const T *input, T *output, const int nevents, const int dims);
+  };
+#endif
+
+}
+
+#endif
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 2ef2729..8fdb236 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -1,15 +1,34 @@
 target_cpu=integrand.so
 cpu_src=integrand.cpp
 
+target_gpu=integrand.cu.o
+gpu_src=integrand.cu.cpp
+
 TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
 TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
 
 CXX=g++
-CXFLAGS=-std=c++14 -shared -fPIC -O2
+NCC=nvcc
+CXFLAGS=-std=c++11 -shared -fPIC -O2
+NCCFLAGS=-std=c++11 -D KERNEL_CUDA=1 -x cu -Xcompiler -fPIC
+NLFLAGS=-L/opt/cuda/lib64 -lcudart
+
+.PHONY: run_cpu run_gpu clean
 
 run_cpu: $(target_cpu)
 	@python cuda_example.py
 
+run_gpu: $(target_gpu) $(target_cpu)
+	@python cuda_example.py
+
 $(target_cpu): $(cpu_src)
 	@echo "Compiling CPU kernel"
 	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
+
+$(target_gpu): $(gpu_src)
+	@echo "Compiling GPU kernel"
+	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
+	@$(CXX) $(CXFLAGS) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+
+clean:
+	rm -f $(target_gpu) $(target_gpu) integrand.so

From bf48a9f0b7c5afa85ea408c46a60b03324033686 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:38:20 +0200
Subject: [PATCH 5/7] now it runs

---
 examples/cuda/integrand.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index a0d7056..28916f0 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -38,13 +38,13 @@ struct IntegrandOpFunctor {
  *
  * The output shape is set to be (input_shape[0], ), i.e., number of events
  */
-//REGISTER_OP("IntegrandOp")
-//    .Input("xarr: double")
-//    .Output("ret: double")
-//    .SetShapeFn([](shape_inference::InferenceContext* c) {
-//        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
-//        return Status::OK();
-//    });
+REGISTER_OP("IntegrandOp")
+    .Input("xarr: double")
+    .Output("ret: double")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+        c -> set_output(0, c -> MakeShape( { c -> Dim(c -> input(0), 0) } ) );
+        return Status::OK();
+    });
 
 template <typename Device, typename T>
 class IntegrandOp: public OpKernel {
@@ -76,7 +76,7 @@ class IntegrandOp: public OpKernel {
 
 // Register the CPU version of the kernel
 #define REGISTER_CPU(T) \
-  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU).TypeConstraint<T>("T"), IntegrandOp<CPUDevice, T>);
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_CPU), IntegrandOp<CPUDevice, T>);
 REGISTER_CPU(double);
 
 // Register the GPU version
@@ -84,6 +84,6 @@ REGISTER_CPU(double);
 #define REGISTER_GPU(T) \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
   extern template class ExampleFunctor<GPUDevice, T>; \
-  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU).TypeConstraint<T>("T"), IntegrandOp<GPUDevice, T>);
+  REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU), IntegrandOp<GPUDevice, T>);
 REGISTER_GPU(double);
 #endif
From 2315f17b974537009e80cd242a1c77f2cb857186 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Mon, 27 Jul 2020 19:46:46 +0200
Subject: [PATCH 6/7] fix copy and paste ...

---
 examples/cuda/integrand.cpp    | 2 +-
 examples/cuda/integrand.cu.cpp | 2 +-
 examples/cuda/makefile         | 9 +++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/cuda/integrand.cpp b/examples/cuda/integrand.cpp
index 28916f0..f2e436c 100644
--- a/examples/cuda/integrand.cpp
+++ b/examples/cuda/integrand.cpp
@@ -83,7 +83,7 @@ REGISTER_CPU(double);
 // Register the GPU version
 #ifdef KERNEL_CUDA
 #define REGISTER_GPU(T) \
   /* Declare explicit instantiations in kernel_example.cu.cc. */ \
-  extern template class ExampleFunctor<GPUDevice, T>; \
+  extern template class IntegrandOpFunctor<GPUDevice, T>; \
   REGISTER_KERNEL_BUILDER(Name("IntegrandOp").Device(DEVICE_GPU), IntegrandOp<GPUDevice, T>);
 REGISTER_GPU(double);
 #endif
diff --git a/examples/cuda/integrand.cu.cpp b/examples/cuda/integrand.cu.cpp
index f977503..d4c61c7 100644
--- a/examples/cuda/integrand.cu.cpp
+++ b/examples/cuda/integrand.cu.cpp
@@ -13,7 +13,7 @@ __global__ void IntegrandOpKernel(const T *input, T *output, const int nevents,
   const auto gid = blockIdx.x*blockDim.x + threadIdx.x;
   // note: this is an example of usage, not an example of a very optimal anything...
   for (int i = gid; i < nevents; i += blockDim.x*gridDim.x) {
-    output[i] = 2.0;
+    output[i] = 0.0;
     for (int j = 0; j < ndim; j++) {
       output[i] += input[i * ndim + j];
     }
diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 8fdb236..2d19d2e 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -10,7 +10,8 @@ TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_l
 CXX=g++
 NCC=nvcc
 CXFLAGS=-std=c++11 -shared -fPIC -O2
-NCCFLAGS=-std=c++11 -D KERNEL_CUDA=1 -x cu -Xcompiler -fPIC
+KERNEL=-D KERNEL_CUDA=1
+NCCFLAGS=-std=c++11 $(KERNEL) -x cu -Xcompiler -fPIC
 NLFLAGS=-L/opt/cuda/lib64 -lcudart
 
 .PHONY: run_cpu run_gpu clean
@@ -18,7 +19,7 @@ NLFLAGS=-L/opt/cuda/lib64 -lcudart
 run_cpu: $(target_cpu)
 	@python cuda_example.py
 
-run_gpu: $(target_gpu) $(target_cpu)
+run_gpu: $(target_gpu)
 	@python cuda_example.py
 
 $(target_cpu): $(cpu_src)
@@ -28,7 +29,7 @@ $(target_cpu): $(cpu_src)
 $(target_gpu): $(gpu_src)
 	@echo "Compiling GPU kernel"
 	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
-	@$(CXX) $(CXFLAGS) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+	@$(CXX) $(CXFLAGS) $(KERNEL) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
 
 clean:
-	rm -f $(target_gpu) $(target_gpu) integrand.so
+	rm -f $(target_gpu) $(target_cpu)

From 92ef4c0aa5112814bf35be2d0b11dbfe5b8d3997 Mon Sep 17 00:00:00 2001
From: juacrumar
Date: Tue, 28 Jul 2020 09:54:48 +0200
Subject: [PATCH 7/7] clean makefile

---
 examples/cuda/makefile | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/examples/cuda/makefile b/examples/cuda/makefile
index 2d19d2e..3afd78e 100644
--- a/examples/cuda/makefile
+++ b/examples/cuda/makefile
@@ -1,35 +1,34 @@
-target_cpu=integrand.so
-cpu_src=integrand.cpp
-
-target_gpu=integrand.cu.o
-gpu_src=integrand.cu.cpp
+target_lib=integrand.so
 
 TF_CFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))' 2> /dev/null`
 TF_LFLAGS=`python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))' 2>/dev/null`
 
 CXX=g++
-NCC=nvcc
 CXFLAGS=-std=c++11 -shared -fPIC -O2
-KERNEL=-D KERNEL_CUDA=1
-NCCFLAGS=-std=c++11 $(KERNEL) -x cu -Xcompiler -fPIC
-NLFLAGS=-L/opt/cuda/lib64 -lcudart
+KERNEL_DEF=-D KERNEL_CUDA=1
+NCCFLAGS=-std=c++11 $(KERNEL_DEF) -x cu -Xcompiler -fPIC --disable-warnings
 
-.PHONY: run_cpu run_gpu clean
+# Check whether there's nvcc
+ifeq (,$(shell which nvcc 2>/dev/null))
+else
+	NCC:=nvcc
+	NCCLIB:=$(subst bin/nvcc,lib64, $(shell which nvcc))
+	CXFLAGS+=$(KERNEL_DEF) -L$(NCCLIB) -lcudart
+	kernel_comp=integrand.cu.o
+endif
 
-run_cpu: $(target_cpu)
+.PHONY: run clean
+
+run: $(target_lib)
 	@python cuda_example.py
 
-run_gpu: $(target_gpu)
-	@python cuda_example.py
-
-$(target_cpu): $(cpu_src)
-	@echo "Compiling CPU kernel"
-	@$(CXX) $(CXFLAGS) $< -o $@ -fPIC $(TF_CFLAGS) $(TF_LFLAGS)
-
-$(target_gpu): $(gpu_src)
-	@echo "Compiling GPU kernel"
-	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
-	@$(CXX) $(CXFLAGS) $(KERNEL) -o $(target_cpu) $(cpu_src) $(target_gpu) $(TF_CFLAGS) $(NLFLAGS) $(TF_LFLAGS)
+%.cu.o: %.cu.cpp
+	@echo "[$(NCC)] Integrating cuda kernel..."
+	@$(NCC) $(NCCFLAGS) -c -o $@ $< $(TF_CFLAGS)
+
+%.so: %.cpp $(kernel_comp)
+	@echo "[$(CXX)] Integrating operator..."
+	@$(CXX) $(CXFLAGS) -o $@ $^ $(TF_CFLAGS) $(TF_LFLAGS)
 
 clean:
-	rm -f $(target_gpu) $(target_cpu)
+	rm -f $(target_lib) $(kernel_comp)
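
Once `make run` succeeds, the compiled operator is not tied to the plain MC of cuda_example.py: any vegasflow integrator accepts it, since it is just a function from a rank-2 tensor of random points to a rank-1 tensor of weights. A minimal sketch with the importance-sampling wrapper from vegasflow.vflow, assuming ./integrand.so has been built as above:

    import tensorflow as tf
    from vegasflow.vflow import vegas_wrapper

    integrand_module = tf.load_op_library("./integrand.so")

    def wrapper_integrand(xarr, **kwargs):
        # the compiled kernel replaces the python integrand body
        return integrand_module.integrand_op(xarr)

    dim, n_iter, ncalls = 4, 5, int(1e6)
    r = vegas_wrapper(wrapper_integrand, dim, n_iter, ncalls)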