From cc9c76cfc3efc422e8a998882bfde3dd2f541759 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 09:49:19 +0100 Subject: [PATCH 01/43] Overload =,+,*; consistent variable names --- src/ctorch.cpp | 32 +++ src/ctorch.h | 25 ++ src/ftorch.f90 | 741 +++++++++++++++++++++++++++--------------------- src/ftorch.fypp | 120 ++++++-- 4 files changed, 569 insertions(+), 349 deletions(-) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 09edd0d7..ecddd188 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -191,6 +191,38 @@ void torch_tensor_delete(torch_tensor_t tensor) delete t; } +torch_tensor_t torch_tensor_assign(const torch_tensor_t input) +{ + auto in = reinterpret_cast(input); + torch::AutoGradMode enable_grad(in->requires_grad()); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = in->detach().clone(); + return output; +} + +torch_tensor_t torch_tensor_add(const torch_tensor_t tensor1, + const torch_tensor_t tensor2) +{ + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = *t1 + *t2; + return output; +} + +torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, + const torch_tensor_t tensor2) +{ + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = *t1 * *t2; + return output; +} + torch_jit_script_module_t torch_jit_load(const char* filename, const torch_device_t device_type = torch_kCPU, const int device_index = -1, diff --git a/src/ctorch.h b/src/ctorch.h index cbc63c9b..bc40047c 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -110,6 +110,31 @@ EXPORT_C int torch_tensor_get_device_index(const torch_tensor_t tensor); */ EXPORT_C void torch_tensor_delete(torch_tensor_t tensor); +/** + * Overloads the assignment operator for Torch Tensor + * @param input Tensor + * @return copy of input Tensor + */ +EXPORT_C torch_tensor_t torch_tensor_assign(const torch_tensor_t input); + +/** + * Overloads the addition operator for two Torch Tensors + * @param first Tensor to be added + * @param second Tensor to be added + * @return sum of the Tensors + */ +EXPORT_C torch_tensor_t torch_tensor_add(const torch_tensor_t tensor1, + const torch_tensor_t tensor2); + +/** + * Overloads the multiplication operator for two Torch Tensors + * @param first Tensor to be multiplied + * @param second Tensor to be multiplied + * @return product of the Tensors + */ +EXPORT_C torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, + const torch_tensor_t tensor2); + // ===================================================================================== // Module API diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 33b75a9f..570ad5ce 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -94,18 +94,31 @@ function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, & end function torch_from_blob_c end interface + interface assignment (=) + module procedure torch_tensor_assign + end interface + + interface operator (+) + module procedure torch_tensor_add + end interface + + interface operator (*) + module procedure torch_tensor_multiply + end interface + contains !> Returns a tensor filled with the scalar value 0. 
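! --- Editor's illustrative sketch (not part of this patch) ----------------
! A minimal example of how the new defined assignment (=), addition (+) and
! multiplication (*) overloads might be used from client code. It assumes the
! module name `ftorch` and the `torch_kFloat32`/`torch_kCPU` constants used
! elsewhere in this file; the variable names (a, b, c, shape_1d) are
! hypothetical.
!
!   use ftorch
!   use, intrinsic :: iso_c_binding, only : c_int64_t
!   type(torch_tensor) :: a, b, c
!   integer(c_int64_t), dimension(1) :: shape_1d = [3]
!
!   call torch_tensor_ones(a, 1, shape_1d, torch_kFloat32, torch_kCPU)
!   call torch_tensor_ones(b, 1, shape_1d, torch_kFloat32, torch_kCPU)
!   c = a + b   ! operator(+) calls torch_tensor_add; assignment(=) then calls torch_tensor_assign
!   c = c * a   ! operator(*) calls torch_tensor_multiply, followed by torch_tensor_assign
!   call torch_tensor_print(c)
!   call torch_tensor_delete(a)
!   call torch_tensor_delete(b)
!   call torch_tensor_delete(c)
! ---------------------------------------------------------------------------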
- function torch_tensor_zeros(ndims, tensor_shape, dtype, device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_zeros(tensor, ndims, tensor_shape, dtype, & + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t + type(torch_tensor), intent(out) :: tensor !! Returned tensor integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! Shape of the tensor integer(c_int), intent(in) :: dtype !! Data type of the tensor integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: device_index_value !! device index used logical(c_bool) :: requires_grad !! Whether gradients need to be computed for the created tensor @@ -139,18 +152,19 @@ end function torch_zeros_c end if tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_zeros + end subroutine torch_tensor_zeros !> Returns a tensor filled with the scalar value 1. - function torch_tensor_ones(ndims, tensor_shape, dtype, device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_ones(tensor, ndims, tensor_shape, dtype, & + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t + type(torch_tensor), intent(out) :: tensor !! Returned tensor integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! Shape of the tensor integer(c_int), intent(in) :: dtype !! Data type of the tensor integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: device_index_value !! device index used logical(c_bool) :: requires_grad !! Whether gradients need to be computed for the created tensor @@ -184,15 +198,16 @@ end function torch_ones_c end if tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_ones + end subroutine torch_tensor_ones ! Torch Tensor API !| Exposes the given data as a tensor without taking ownership of the original data. ! This routine will take an (i, j, k) array and return an (k, j, i) tensor. - function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, & - device_type, device_index, & - requires_grad_opt) result(tensor) + subroutine torch_tensor_from_blob(tensor, data, ndims, tensor_shape, layout, & + dtype, device_type, device_index, & + requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t, c_ptr + type(torch_tensor), intent(out) :: tensor !! Returned tensor type(c_ptr), intent(in) :: data !! Pointer to data integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! 
Shape of the tensor @@ -201,7 +216,6 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, & integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: i !! loop index integer(c_int64_t) :: strides(ndims) !! Strides for accessing data @@ -229,7 +243,7 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, & endif tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_from_blob + end subroutine torch_tensor_from_blob !> Prints the contents of a tensor. subroutine torch_tensor_print(tensor) @@ -249,7 +263,7 @@ end subroutine torch_tensor_print !> Determines the device index of a tensor. function torch_tensor_get_device_index(tensor) result(device_index) use, intrinsic :: iso_c_binding, only : c_int - type(torch_tensor), intent(in) :: tensor !! Input tensor + type(torch_tensor), value, intent(in) :: tensor !! Input tensor integer(c_int) :: device_index !! Device index of tensor interface @@ -279,6 +293,61 @@ end subroutine torch_tensor_delete_c call torch_tensor_delete_c(tensor%p) end subroutine torch_tensor_delete + !> Overloads assignment operator for tensors. + subroutine torch_tensor_assign(output, input) + type(torch_tensor), intent(out) :: output + type(torch_tensor), intent(in) :: input + + interface + function torch_tensor_assign_c(input_c) result(output_c) & + bind(c, name = 'torch_tensor_assign') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: input_c + type(c_ptr) :: output_c + end function torch_tensor_assign_c + end interface + + output%p = torch_tensor_assign_c(input%p) + end subroutine torch_tensor_assign + + !> Overloads addition operator for two tensors. + function torch_tensor_add(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_add_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_add') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_add_c + end interface + + output%p = torch_tensor_add_c(tensor1%p, tensor2%p) + end function torch_tensor_add + + !> Overloads multiplication operator for two tensors. + function torch_tensor_multiply(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_multiply_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_multiply') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_multiply_c + end interface + + output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) + end function torch_tensor_multiply + ! 
Torch Module API !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript) function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module) @@ -398,23 +467,24 @@ end subroutine torch_jit_module_delete_c end subroutine torch_module_delete !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8` - function torch_tensor_from_array_int8_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int8_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int8), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -424,7 +494,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device_type, device_ ! Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -436,38 +506,39 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device_type, device_ requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int8_1d + end subroutine torch_tensor_from_array_int8_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8` - function torch_tensor_from_array_int8_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int8_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 + ! 
output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int8), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -477,7 +548,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device_type, device_ ! Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -489,38 +560,39 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device_type, device_ requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int8_2d + end subroutine torch_tensor_from_array_int8_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8` - function torch_tensor_from_array_int8_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int8_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int8), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! 
local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -530,7 +602,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device_type, device_ ! Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -542,38 +614,39 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device_type, device_ requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int8_3d + end subroutine torch_tensor_from_array_int8_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8` - function torch_tensor_from_array_int8_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int8_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int8), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -583,7 +656,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device_type, device_ ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -595,38 +668,39 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device_type, device_ requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int8_4d + end subroutine torch_tensor_from_array_int8_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16` - function torch_tensor_from_array_int16_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int16_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int16), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -636,7 +710,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -648,38 +722,39 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int16_1d + end subroutine torch_tensor_from_array_int16_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16` - function torch_tensor_from_array_int16_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int16_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int16), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -689,7 +764,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -701,38 +776,39 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int16_2d + end subroutine torch_tensor_from_array_int16_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16` - function torch_tensor_from_array_int16_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int16_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int16), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -742,7 +818,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -754,38 +830,39 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int16_3d + end subroutine torch_tensor_from_array_int16_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16` - function torch_tensor_from_array_int16_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int16_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int16), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -795,7 +872,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -807,38 +884,39 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int16_4d + end subroutine torch_tensor_from_array_int16_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32` - function torch_tensor_from_array_int32_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int32_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int32), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -848,7 +926,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -860,38 +938,39 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int32_1d + end subroutine torch_tensor_from_array_int32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32` - function torch_tensor_from_array_int32_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int32_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int32), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -901,7 +980,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -913,38 +992,39 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int32_2d + end subroutine torch_tensor_from_array_int32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32` - function torch_tensor_from_array_int32_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int32_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int32), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -954,7 +1034,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -966,38 +1046,39 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int32_3d + end subroutine torch_tensor_from_array_int32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32` - function torch_tensor_from_array_int32_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int32_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int32), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -1007,7 +1088,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1019,38 +1100,39 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int32_4d + end subroutine torch_tensor_from_array_int32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64` - function torch_tensor_from_array_int64_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int64_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int64), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -1060,7 +1142,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1072,38 +1154,39 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int64_1d + end subroutine torch_tensor_from_array_int64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64` - function torch_tensor_from_array_int64_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int64_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int64), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -1113,7 +1196,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1125,38 +1208,39 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int64_2d + end subroutine torch_tensor_from_array_int64_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int64` - function torch_tensor_from_array_int64_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int64_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int64), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -1166,7 +1250,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1178,38 +1262,39 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int64_3d + end subroutine torch_tensor_from_array_int64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64` - function torch_tensor_from_array_int64_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_int64_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs integer(kind=int64), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -1219,7 +1304,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device_type, device ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1231,38 +1316,39 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device_type, device requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_int64_4d + end subroutine torch_tensor_from_array_int64_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32` - function torch_tensor_from_array_real32_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real32_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real32), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -1272,7 +1358,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1284,38 +1370,39 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real32_1d + end subroutine torch_tensor_from_array_real32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32` - function torch_tensor_from_array_real32_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real32_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real32), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -1325,7 +1412,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1337,38 +1424,39 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real32_2d + end subroutine torch_tensor_from_array_real32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32` - function torch_tensor_from_array_real32_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real32_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real32), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -1378,7 +1466,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1390,38 +1478,39 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real32_3d + end subroutine torch_tensor_from_array_real32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32` - function torch_tensor_from_array_real32_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real32_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real32), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -1431,7 +1520,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1443,38 +1532,39 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real32_4d + end subroutine torch_tensor_from_array_real32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64` - function torch_tensor_from_array_real64_1d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real64_1d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real64), intent(in), target :: data_in(:) !! Input data that tensor will point at integer, intent(in) :: layout(1) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type integer(c_int64_t) :: strides(1) !! Strides for accessing data integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer :: i @@ -1484,7 +1574,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1496,38 +1586,39 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real64_1d + end subroutine torch_tensor_from_array_real64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64` - function torch_tensor_from_array_real64_2d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real64_2d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real64), intent(in), target :: data_in(:,:) !! Input data that tensor will point at integer, intent(in) :: layout(2) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type integer(c_int64_t) :: strides(2) !! Strides for accessing data integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer :: i @@ -1537,7 +1628,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1549,38 +1640,39 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real64_2d + end subroutine torch_tensor_from_array_real64_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64` - function torch_tensor_from_array_real64_3d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real64_3d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real64), intent(in), target :: data_in(:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(3) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type integer(c_int64_t) :: strides(3) !! Strides for accessing data integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer :: i @@ -1590,7 +1682,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1602,38 +1694,39 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real64_3d + end subroutine torch_tensor_from_array_real64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64` - function torch_tensor_from_array_real64_4d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_real64_4d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs real(kind=real64), intent(in), target :: data_in(:,:,:,:) !! Input data that tensor will point at integer, intent(in) :: layout(4) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type integer(c_int64_t) :: strides(4) !! Strides for accessing data integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer :: i @@ -1643,7 +1736,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device_type, devic ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -1655,19 +1748,19 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device_type, devic requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_real64_4d + end subroutine torch_tensor_from_array_real64_4d end module ftorch diff --git a/src/ftorch.fypp b/src/ftorch.fypp index c33fec08..07adec74 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -92,18 +92,31 @@ module ftorch end function torch_from_blob_c end interface + interface assignment (=) + module procedure torch_tensor_assign + end interface + + interface operator (+) + module procedure torch_tensor_add + end interface + + interface operator (*) + module procedure torch_tensor_multiply + end interface + contains !> Returns a tensor filled with the scalar value 0. - function torch_tensor_zeros(ndims, tensor_shape, dtype, device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_zeros(tensor, ndims, tensor_shape, dtype, & + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t + type(torch_tensor), intent(out) :: tensor !! Returned tensor integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! Shape of the tensor integer(c_int), intent(in) :: dtype !! Data type of the tensor integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: device_index_value !! device index used logical(c_bool) :: requires_grad !! Whether gradients need to be computed for the created tensor @@ -137,18 +150,19 @@ contains end if tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_zeros + end subroutine torch_tensor_zeros !> Returns a tensor filled with the scalar value 1. - function torch_tensor_ones(ndims, tensor_shape, dtype, device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_ones(tensor, ndims, tensor_shape, dtype, & + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t + type(torch_tensor), intent(out) :: tensor !! Returned tensor integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! Shape of the tensor integer(c_int), intent(in) :: dtype !! Data type of the tensor integer(c_int), intent(in) :: device_type !! 
Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: device_index_value !! device index used logical(c_bool) :: requires_grad !! Whether gradients need to be computed for the created tensor @@ -182,15 +196,16 @@ contains end if tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_ones + end subroutine torch_tensor_ones ! Torch Tensor API !| Exposes the given data as a tensor without taking ownership of the original data. ! This routine will take an (i, j, k) array and return an (k, j, i) tensor. - function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, & - device_type, device_index, & - requires_grad_opt) result(tensor) + subroutine torch_tensor_from_blob(tensor, data, ndims, tensor_shape, layout, & + dtype, device_type, device_index, & + requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_int, c_int64_t, c_ptr + type(torch_tensor), intent(out) :: tensor !! Returned tensor type(c_ptr), intent(in) :: data !! Pointer to data integer(c_int), intent(in) :: ndims !! Number of dimensions of the tensor integer(c_int64_t), intent(in) :: tensor_shape(*) !! Shape of the tensor @@ -199,7 +214,6 @@ contains integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical(c_bool), optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - type(torch_tensor) :: tensor !! Returned tensor integer(c_int) :: i !! loop index integer(c_int64_t) :: strides(ndims) !! Strides for accessing data @@ -227,7 +241,7 @@ contains endif tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index_value, requires_grad) - end function torch_tensor_from_blob + end subroutine torch_tensor_from_blob !> Prints the contents of a tensor. subroutine torch_tensor_print(tensor) @@ -247,7 +261,7 @@ contains !> Determines the device index of a tensor. function torch_tensor_get_device_index(tensor) result(device_index) use, intrinsic :: iso_c_binding, only : c_int - type(torch_tensor), intent(in) :: tensor !! Input tensor + type(torch_tensor), value, intent(in) :: tensor !! Input tensor integer(c_int) :: device_index !! Device index of tensor interface @@ -277,6 +291,61 @@ contains call torch_tensor_delete_c(tensor%p) end subroutine torch_tensor_delete + !> Overloads assignment operator for tensors. + subroutine torch_tensor_assign(output, input) + type(torch_tensor), intent(out) :: output + type(torch_tensor), intent(in) :: input + + interface + function torch_tensor_assign_c(input_c) result(output_c) & + bind(c, name = 'torch_tensor_assign') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: input_c + type(c_ptr) :: output_c + end function torch_tensor_assign_c + end interface + + output%p = torch_tensor_assign_c(input%p) + end subroutine torch_tensor_assign + + !> Overloads addition operator for two tensors. 
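
! Usage sketch (illustrative only, not taken from the patch itself): once
! assignment(=) and operator(+) are associated with torch_tensor_assign and
! torch_tensor_add as above, tensors can be combined with ordinary Fortran
! syntax. The program name, array names and values below are assumptions
! chosen for the example.

program add_demo
  use ftorch
  implicit none
  integer :: layout(1) = [1]
  real, target :: x(2) = [1.0, 2.0], y(2) = [3.0, 4.0]
  type(torch_tensor) :: a, b, Q

  call torch_tensor_from_array(a, x, layout, torch_kCPU)
  call torch_tensor_from_array(b, y, layout, torch_kCPU)
  Q = a + b                   ! torch_tensor_add, then torch_tensor_assign
  call torch_tensor_print(Q)  ! expect [4.0, 6.0]
end program add_demo
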
+ function torch_tensor_add(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_add_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_add') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_add_c + end interface + + output%p = torch_tensor_add_c(tensor1%p, tensor2%p) + end function torch_tensor_add + + !> Overloads multiplication operator for two tensors. + function torch_tensor_multiply(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_multiply_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_multiply') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_multiply_c + end interface + + output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) + end function torch_tensor_multiply + ! Torch Module API !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript) function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module) @@ -398,23 +467,24 @@ contains #:for PREC in PRECISIONS #:for RANK in RANKS !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$` - function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device_type, device_index, requires_grad_opt) result(tensor) + subroutine torch_tensor_from_array_${PREC}$_${RANK}$d(tensor, data_in, & + layout, device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : ${PREC}$ + ! output tensory + type(torch_tensor), intent(out) :: tensor !! Returned tensor + ! inputs ${f_type(PREC)}$(kind=${PREC}$), intent(in), target :: data_in${ranksuffix(RANK)}$ !! Input data that tensor will point at integer, intent(in) :: layout(${RANK}$) !! Control order of indices - integer(c_int), intent(in) :: c_device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) + integer(c_int), intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`) integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case logical, optional, intent(in) :: requires_grad_opt !! Whether gradients need to be computed for the created tensor - ! output tensory - type(torch_tensor) :: tensor !! Returned tensor - ! local data - integer(c_int64_t) :: c_tensor_shape(${RANK}$) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type + integer(c_int64_t) :: tensor_shape(${RANK}$) !! Shape of the tensor + integer(c_int), parameter :: dtype = ${enum_from_prec(PREC)}$ !! Data type integer(c_int64_t) :: strides(${RANK}$) !! Strides for accessing data integer(c_int), parameter :: ndims = ${RANK}$ !! Number of dimension of input data integer :: i @@ -424,7 +494,7 @@ contains ! 
Process optional arguments if (present(device_index)) then device_index_value = device_index - else if (c_device_type == torch_kCPU) then + else if (device_type == torch_kCPU) then device_index_value = -1 else device_index_value = 0 @@ -436,19 +506,19 @@ contains requires_grad = requires_grad_opt end if - c_tensor_shape = shape(data_in) + tensor_shape = shape(data_in) strides(layout(1)) = 1 do i = 2, ndims - strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1)) + strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1)) end do - tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, & - strides, c_dtype, c_device_type, & + tensor%p = torch_from_blob_c(c_loc(data_in), ndims, tensor_shape, & + strides, dtype, device_type, & device_index_value, & logical(requires_grad, c_bool)) - end function torch_tensor_from_array_${PREC}$_${RANK}$d + end subroutine torch_tensor_from_array_${PREC}$_${RANK}$d #:endfor #:endfor From 366ffda54b3f775e4f22c1c751608155fb27bc9f Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 09:49:37 +0100 Subject: [PATCH 02/43] Update existing examples --- examples/1_SimpleNet/simplenet_infer_fortran.f90 | 4 ++-- examples/2_ResNet18/resnet_infer_fortran.f90 | 4 ++-- examples/3_MultiGPU/simplenet_infer_fortran.f90 | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90 index f08f178b..d31c73b6 100644 --- a/examples/1_SimpleNet/simplenet_infer_fortran.f90 +++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90 @@ -37,8 +37,8 @@ program inference in_data = [0.0, 1.0, 2.0, 3.0, 4.0] ! Create Torch input/output tensors from the above arrays - in_tensors(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU) - out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU) + call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, torch_kCPU) + call torch_tensor_from_array(out_tensor, out_data, tensor_layout, torch_kCPU) ! Load ML model model = torch_module_load(args(1)) diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90 index 2ee9de33..465de23a 100644 --- a/examples/2_ResNet18/resnet_infer_fortran.f90 +++ b/examples/2_ResNet18/resnet_infer_fortran.f90 @@ -77,9 +77,9 @@ subroutine main() call load_data(filename, tensor_length, in_data) ! Create input/output tensors from the above arrays - in_tensors(1) = torch_tensor_from_array(in_data, in_layout, torch_kCPU) + call torch_tensor_from_array(in_tensors(1), in_data, in_layout, torch_kCPU) - out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU) + call torch_tensor_from_array(out_tensor, out_data, out_layout, torch_kCPU) ! Load ML model (edit this line to use different models) model = torch_module_load(args(1)) diff --git a/examples/3_MultiGPU/simplenet_infer_fortran.f90 b/examples/3_MultiGPU/simplenet_infer_fortran.f90 index 14ed368a..2f2da041 100644 --- a/examples/3_MultiGPU/simplenet_infer_fortran.f90 +++ b/examples/3_MultiGPU/simplenet_infer_fortran.f90 @@ -49,13 +49,13 @@ program inference ! Create Torch input tensor from the above array and assign it to the first (and only) ! element in the array of input tensors. ! We use the torch_kCUDA device type with device index corresponding to the MPI rank. 
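
! Worked example (editorial sketch) of the stride computation used by the
! torch_tensor_from_array and torch_tensor_from_blob routines in this patch
! series: for a rank-2 array of shape (2,3), layout = [1,2] yields
! strides = [1,2] (contiguous in the first, Fortran-fastest dimension), while
! layout = [2,1] yields strides = [3,1]. The shape and layout values below are
! assumptions chosen only to make the recurrence concrete.

program stride_demo
  use, intrinsic :: iso_c_binding, only : c_int64_t
  implicit none
  integer, parameter :: ndims = 2
  integer :: layout(ndims) = [2, 1]
  integer(c_int64_t) :: tensor_shape(ndims) = [2, 3]
  integer(c_int64_t) :: strides(ndims)
  integer :: i

  ! Same recurrence as in the library code
  strides(layout(1)) = 1
  do i = 2, ndims
    strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1))
  end do
  print *, strides  ! prints 3 1 for layout = [2, 1]
end program stride_demo
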
-  in_tensors(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA, &
-                                          device_index=rank)
+  call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, &
+                               torch_kCUDA, device_index=rank)

   ! Create Torch output tensor from the above array.
   ! Here we use the torch_kCPU device type since the tensor is for output only
   ! i.e. to be subsequently used by Fortran on CPU.
-  out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
+  call torch_tensor_from_array(out_tensor, out_data, tensor_layout, torch_kCPU)

   ! Load ML model. Ensure that the same device type and device index are used
   ! as for the input data.

From 852b154d9d61185a4604ce04ad93b23208068c59 Mon Sep 17 00:00:00 2001
From: Joe Wallwork
Date: Fri, 21 Jun 2024 09:50:31 +0100
Subject: [PATCH 03/43] Add autograd example

---
 examples/5_Autograd/CMakeLists.txt | 33 +++++++++++++++++++
 examples/5_Autograd/autograd.f90 | 47 ++++++++++++++++++++++++++++
 examples/5_Autograd/autograd.py | 17 ++++++++++
 examples/5_Autograd/requirements.txt | 0
 examples/CMakeLists.txt | 1 +
 run_integration_tests.sh | 6 +++-
 src/CMakeLists.txt | 3 ++
 7 files changed, 106 insertions(+), 1 deletion(-)
 create mode 100644 examples/5_Autograd/CMakeLists.txt
 create mode 100644 examples/5_Autograd/autograd.f90
 create mode 100755 examples/5_Autograd/autograd.py
 create mode 100644 examples/5_Autograd/requirements.txt

diff --git a/examples/5_Autograd/CMakeLists.txt b/examples/5_Autograd/CMakeLists.txt
new file mode 100644
index 00000000..41cd700b
--- /dev/null
+++ b/examples/5_Autograd/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+#policy CMP0076 - target_sources source files are relative to file where target_sources is run
+cmake_policy (SET CMP0076 NEW)
+
+set(PROJECT_NAME AutogradExample)
+
+project(${PROJECT_NAME} LANGUAGES Fortran)
+
+# Build in Debug mode if not specified
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug CACHE STRING "" FORCE)
+endif()
+
+find_package(FTorch)
+message(STATUS "Building with Fortran PyTorch coupling")
+
+# Fortran example
+add_executable(autograd autograd.f90)
+target_link_libraries(autograd PRIVATE FTorch::ftorch)
+
+# Integration testing
+if(CMAKE_BUILD_TESTS)
+  include(CTest)
+
+  # 1. Check the Python Autograd script runs successfully
+  add_test(NAME pyautograd
+           COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/autograd.py)
+
+  # 2. Check the Fortran Autograd example runs successfully
+  add_test(NAME fautograd
+           COMMAND autograd
+           WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
+endif()
diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90
new file mode 100644
index 00000000..9853d8b1
--- /dev/null
+++ b/examples/5_Autograd/autograd.f90
@@ -0,0 +1,47 @@
+program example
+  use ftorch
+  implicit none
+
+  integer :: tensor_layout(1) = [1]
+  type(torch_tensor) :: a, b, Q
+
+  call torch_tensor_from_array(a, [2.0, 3.0], tensor_layout, torch_kCPU, &
+                               device_index=0)  ! FIXME: requires_grad=.true.
+  call torch_tensor_from_array(b, [6.0, 4.0], tensor_layout, torch_kCPU, &
+                               device_index=0)  ! FIXME: requires_grad=.true.
+
+  ! TODO: Q = 3 * a ** 3 - 2 * b
+  ! Requires overloading elementary operations
+  print *, "a ="
+  call torch_tensor_print(a)
+  print *, "b ="
+  call torch_tensor_print(b)
+  ! ---
+  Q = a
+  print *, "Q = a ="
+  call torch_tensor_print(Q)
+  print *, "a ="
+  call torch_tensor_print(a)
+  print *, "b ="
+  call torch_tensor_print(b)
+  !
--- + print *, "Q = a + b =" + Q = a + b + call torch_tensor_print(Q) + print *, "a =" + call torch_tensor_print(a) + print *, "b =" + call torch_tensor_print(b) + ! --- + print *, "Q = a * b =" + Q = a * b + call torch_tensor_print(Q) + print *, "a =" + call torch_tensor_print(a) + print *, "b =" + call torch_tensor_print(b) + + ! TODO: Backward + ! Requires API extension + +end program example diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py new file mode 100755 index 00000000..d8dcc0b7 --- /dev/null +++ b/examples/5_Autograd/autograd.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 +""" +Autograd demo taken from +https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html +""" +import torch + +a = torch.tensor([2.0, 3.0], requires_grad=True) +b = torch.tensor([6.0, 4.0], requires_grad=True) + +Q = 3 * a ** 3 - b ** 2 + +external_grad = torch.tensor([1.0, 1.0]) +Q.backward(gradient=external_grad) + +assert torch.allclose(9 * a ** 2, a.grad) +assert torch.allclose(-2 * b, b.grad) diff --git a/examples/5_Autograd/requirements.txt b/examples/5_Autograd/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index ba8566ab..128c1646 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,5 @@ if(CMAKE_BUILD_TESTS) add_subdirectory(1_SimpleNet) add_subdirectory(2_ResNet18) + add_subdirectory(5_Autograd) endif() diff --git a/run_integration_tests.sh b/run_integration_tests.sh index 35fa928c..cb1ed790 100755 --- a/run_integration_tests.sh +++ b/run_integration_tests.sh @@ -12,7 +12,11 @@ set -eu CTEST_ARGS=$@ -EXAMPLES="1_SimpleNet 2_ResNet18" +EXAMPLES=" + 1_SimpleNet + 2_ResNet18 + 5_Autograd +" BUILD_DIR=src/build for EXAMPLE in ${EXAMPLES} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f281a403..760a9c68 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -94,5 +94,8 @@ if(CMAKE_BUILD_TESTS) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/2_ResNet18 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples ) + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/5_Autograd + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples + ) add_subdirectory(test/examples) endif() From f7bca7ef7d406bd458dc0c49f72567594abff4b5 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 09:59:30 +0100 Subject: [PATCH 04/43] Apply Black --- examples/5_Autograd/autograd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py index d8dcc0b7..3b663c17 100755 --- a/examples/5_Autograd/autograd.py +++ b/examples/5_Autograd/autograd.py @@ -8,10 +8,10 @@ a = torch.tensor([2.0, 3.0], requires_grad=True) b = torch.tensor([6.0, 4.0], requires_grad=True) -Q = 3 * a ** 3 - b ** 2 +Q = 3 * a**3 - b**2 external_grad = torch.tensor([1.0, 1.0]) Q.backward(gradient=external_grad) -assert torch.allclose(9 * a ** 2, a.grad) +assert torch.allclose(9 * a**2, a.grad) assert torch.allclose(-2 * b, b.grad) From 52321930fae5c4cbb5d65e57b3c55588d7a4e3a2 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 14:14:48 +0100 Subject: [PATCH 05/43] Overload - and test --- examples/5_Autograd/autograd.f90 | 8 ++++++++ src/ctorch.cpp | 11 +++++++++++ src/ctorch.h | 9 +++++++++ src/ftorch.f90 | 23 +++++++++++++++++++++++ src/ftorch.fypp | 23 +++++++++++++++++++++++ 5 files changed, 74 insertions(+) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 
9853d8b1..7302c017 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -40,6 +40,14 @@ program example call torch_tensor_print(a) print *, "b =" call torch_tensor_print(b) + ! --- + print *, "Q = a - b =" + Q = a - b + call torch_tensor_print(Q) + print *, "a =" + call torch_tensor_print(a) + print *, "b =" + call torch_tensor_print(b) ! TODO: Backward ! Requires API extension diff --git a/src/ctorch.cpp b/src/ctorch.cpp index ecddd188..090c8265 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -212,6 +212,17 @@ torch_tensor_t torch_tensor_add(const torch_tensor_t tensor1, return output; } +torch_tensor_t torch_tensor_subtract(const torch_tensor_t tensor1, + const torch_tensor_t tensor2) +{ + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = *t1 - *t2; + return output; +} + torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, const torch_tensor_t tensor2) { diff --git a/src/ctorch.h b/src/ctorch.h index bc40047c..304cbe10 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -126,6 +126,15 @@ EXPORT_C torch_tensor_t torch_tensor_assign(const torch_tensor_t input); EXPORT_C torch_tensor_t torch_tensor_add(const torch_tensor_t tensor1, const torch_tensor_t tensor2); +/** + * Overloads the subtraction operator for two Torch Tensors + * @param first Tensor to be subtracted + * @param second Tensor to be subtracted + * @return difference of the Tensors + */ +EXPORT_C torch_tensor_t torch_tensor_subtract(const torch_tensor_t tensor1, + const torch_tensor_t tensor2); + /** * Overloads the multiplication operator for two Torch Tensors * @param first Tensor to be multiplied diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 570ad5ce..a5448641 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -102,6 +102,10 @@ end function torch_from_blob_c module procedure torch_tensor_add end interface + interface operator (-) + module procedure torch_tensor_subtract + end interface + interface operator (*) module procedure torch_tensor_multiply end interface @@ -329,6 +333,25 @@ end function torch_tensor_add_c output%p = torch_tensor_add_c(tensor1%p, tensor2%p) end function torch_tensor_add + !> Overloads subtraction operator for two tensors. + function torch_tensor_subtract(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_subtract_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_subtract') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_subtract_c + end interface + + output%p = torch_tensor_subtract_c(tensor1%p, tensor2%p) + end function torch_tensor_subtract + !> Overloads multiplication operator for two tensors. 
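
  ! Note (editorial, hedged): like torch_tensor_assign and torch_tensor_add,
  ! each overloaded operation above returns a tensor that wraps a fresh `new`
  ! allocation on the C++ side, so results that are no longer needed can be
  ! released with torch_tensor_delete. The names a, b and Q are placeholders
  ! for tensors created as in the autograd example, e.g.
  !
  !   Q = a - b                    ! resolves to torch_tensor_subtract
  !   call torch_tensor_print(Q)
  !   call torch_tensor_delete(Q)  ! free the result when done with it
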
function torch_tensor_multiply(tensor1, tensor2) result(output) type(torch_tensor), intent(in) :: tensor1 diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 07adec74..2b83ddc6 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -100,6 +100,10 @@ module ftorch module procedure torch_tensor_add end interface + interface operator (-) + module procedure torch_tensor_subtract + end interface + interface operator (*) module procedure torch_tensor_multiply end interface @@ -327,6 +331,25 @@ contains output%p = torch_tensor_add_c(tensor1%p, tensor2%p) end function torch_tensor_add + !> Overloads subtraction operator for two tensors. + function torch_tensor_subtract(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_subtract_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_subtract') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_subtract_c + end interface + + output%p = torch_tensor_subtract_c(tensor1%p, tensor2%p) + end function torch_tensor_subtract + !> Overloads multiplication operator for two tensors. function torch_tensor_multiply(tensor1, tensor2) result(output) type(torch_tensor), intent(in) :: tensor1 From a730a4062e7ab77bc936030f6f0fcd3ced9be015 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 14:17:30 +0100 Subject: [PATCH 06/43] Overload / and test --- examples/5_Autograd/autograd.f90 | 8 ++++++++ src/ctorch.cpp | 11 +++++++++++ src/ctorch.h | 9 +++++++++ src/ftorch.f90 | 23 +++++++++++++++++++++++ src/ftorch.fypp | 23 +++++++++++++++++++++++ 5 files changed, 74 insertions(+) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 7302c017..3b796333 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -48,6 +48,14 @@ program example call torch_tensor_print(a) print *, "b =" call torch_tensor_print(b) + ! --- + print *, "Q = a / b =" + Q = a / b + call torch_tensor_print(Q) + print *, "a =" + call torch_tensor_print(a) + print *, "b =" + call torch_tensor_print(b) ! TODO: Backward ! 
Requires API extension diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 090c8265..598caff8 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -234,6 +234,17 @@ torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, return output; } +torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, + const torch_tensor_t tensor2) +{ + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = *t1 / *t2; + return output; +} + torch_jit_script_module_t torch_jit_load(const char* filename, const torch_device_t device_type = torch_kCPU, const int device_index = -1, diff --git a/src/ctorch.h b/src/ctorch.h index 304cbe10..f31c6511 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -144,6 +144,15 @@ EXPORT_C torch_tensor_t torch_tensor_subtract(const torch_tensor_t tensor1, EXPORT_C torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, const torch_tensor_t tensor2); +/** + * Overloads the division operator for two Torch Tensors + * @param first Tensor to be divided + * @param second Tensor to be divided + * @return quotient of the Tensors + */ +EXPORT_C torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, + const torch_tensor_t tensor2); + // ===================================================================================== // Module API diff --git a/src/ftorch.f90 b/src/ftorch.f90 index a5448641..0788d5d3 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -110,6 +110,10 @@ end function torch_from_blob_c module procedure torch_tensor_multiply end interface + interface operator (/) + module procedure torch_tensor_divide + end interface + contains !> Returns a tensor filled with the scalar value 0. @@ -371,6 +375,25 @@ end function torch_tensor_multiply_c output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) end function torch_tensor_multiply + !> Overloads division operator for two tensors. + function torch_tensor_divide(tensor1, tensor2) result(output) + type(torch_tensor), intent(in) :: tensor1 + type(torch_tensor), intent(in) :: tensor2 + type(torch_tensor) :: output + + interface + function torch_tensor_divide_c(tensor1_c, tensor2_c) result(output_c) & + bind(c, name = 'torch_tensor_divide') + use, intrinsic :: iso_c_binding, only : c_ptr + type(c_ptr), value, intent(in) :: tensor1_c + type(c_ptr), value, intent(in) :: tensor2_c + type(c_ptr) :: output_c + end function torch_tensor_divide_c + end interface + + output%p = torch_tensor_divide_c(tensor1%p, tensor2%p) + end function torch_tensor_divide + ! Torch Module API !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript) function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module) diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 2b83ddc6..c02cc38c 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -108,6 +108,10 @@ module ftorch module procedure torch_tensor_multiply end interface + interface operator (/) + module procedure torch_tensor_divide + end interface + contains !> Returns a tensor filled with the scalar value 0. @@ -369,6 +373,25 @@ contains output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) end function torch_tensor_multiply + !> Overloads division operator for two tensors. 
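
! Usage sketch (illustrative only, not taken from the patch itself): the
! overloaded * and / map onto Torch's elementwise multiply and divide rather
! than matrix products, so the operands are expected to have matching (or
! broadcastable) shapes. The program name, array names and values below are
! assumptions.

program divide_demo
  use ftorch
  implicit none
  integer :: layout(1) = [1]
  real, target :: x(2) = [6.0, 4.0], y(2) = [2.0, 4.0]
  type(torch_tensor) :: a, b, Q

  call torch_tensor_from_array(a, x, layout, torch_kCPU)
  call torch_tensor_from_array(b, y, layout, torch_kCPU)
  Q = a / b                   ! elementwise: expect [3.0, 1.0]
  call torch_tensor_print(Q)
end program divide_demo
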
+  function torch_tensor_divide(tensor1, tensor2) result(output)
+    type(torch_tensor), intent(in) :: tensor1
+    type(torch_tensor), intent(in) :: tensor2
+    type(torch_tensor) :: output
+
+    interface
+      function torch_tensor_divide_c(tensor1_c, tensor2_c) result(output_c) &
+          bind(c, name = 'torch_tensor_divide')
+        use, intrinsic :: iso_c_binding, only : c_ptr
+        type(c_ptr), value, intent(in) :: tensor1_c
+        type(c_ptr), value, intent(in) :: tensor2_c
+        type(c_ptr) :: output_c
+      end function torch_tensor_divide_c
+    end interface
+
+    output%p = torch_tensor_divide_c(tensor1%p, tensor2%p)
+  end function torch_tensor_divide
+
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
   function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module)

From 7abeba7be5d56a948d85e50b8de0402c7f5f9409 Mon Sep 17 00:00:00 2001
From: Joe Wallwork
Date: Fri, 21 Jun 2024 14:32:36 +0100
Subject: [PATCH 07/43] Overload ** and test

---
 examples/5_Autograd/autograd.f90 | 6 ++
 src/ctorch.cpp | 10 +++
 src/ctorch.h | 9 +++
 src/ftorch.f90 | 130 +++++++++++++++++++++++++++++++
 src/ftorch.fypp | 29 +++++++
 5 files changed, 184 insertions(+)

diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90
index 3b796333..1ea9c769 100644
--- a/examples/5_Autograd/autograd.f90
+++ b/examples/5_Autograd/autograd.f90
@@ -56,6 +56,12 @@ program example
   call torch_tensor_print(a)
   print *, "b ="
   call torch_tensor_print(b)
+  ! ---
+  print *, "Q = a ** 2 ="
+  Q = a ** 2
+  call torch_tensor_print(Q)
+  print *, "a ="
+  call torch_tensor_print(a)

   ! TODO: Backward
   ! Requires API extension
diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 598caff8..569b4070 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -245,6 +245,16 @@ torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1,
   return output;
 }

+torch_tensor_t torch_tensor_power(const torch_tensor_t tensor,
+                                  const torch_data_t exponent)
+{
+  auto t = reinterpret_cast(tensor);
+  torch::Tensor* output = nullptr;
+  output = new torch::Tensor;
+  *output = pow(*t, exponent);
+  return output;
+}
+
 torch_jit_script_module_t torch_jit_load(const char* filename,
                                          const torch_device_t device_type = torch_kCPU,
                                          const int device_index = -1,
diff --git a/src/ctorch.h b/src/ctorch.h
index f31c6511..46aec19b 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -153,6 +153,15 @@ EXPORT_C torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1,
                                             const torch_tensor_t tensor2);

+/**
+ * Overloads the exponentiation operator for a Torch Tensor raised to a scalar exponent
+ * @param Tensor to take the power of
+ * @param scalar exponent
+ * @return power of the Tensor
+ */
+EXPORT_C torch_tensor_t torch_tensor_power(const torch_tensor_t tensor,
+                                           const torch_data_t exponent);
+
 // =====================================================================================
 // Module API
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 0788d5d3..66bd76dc 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -114,6 +114,15 @@ end function torch_from_blob_c
     module procedure torch_tensor_divide
   end interface

+  interface operator (**)
+    module procedure torch_tensor_power_int8
+    module procedure torch_tensor_power_int16
+    module procedure torch_tensor_power_int32
+    module procedure torch_tensor_power_int64
+    module procedure torch_tensor_power_real32
+    module procedure torch_tensor_power_real64
+  end interface
+
 contains

   !> Returns a tensor filled
with the scalar value 0. @@ -394,6 +403,127 @@ end function torch_tensor_divide_c output%p = torch_tensor_divide_c(tensor1%p, tensor2%p) end function torch_tensor_divide + !> Overloads exponentiation operator for a tensor and a scalar of type `int8` + function torch_tensor_power_int8(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int8), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int8 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int8), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_int8 + + !> Overloads exponentiation operator for a tensor and a scalar of type `int16` + function torch_tensor_power_int16(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int16), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int16 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int16), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_int16 + + !> Overloads exponentiation operator for a tensor and a scalar of type `int32` + function torch_tensor_power_int32(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int32), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int32 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int32), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_int32 + + !> Overloads exponentiation operator for a tensor and a scalar of type `int64` + function torch_tensor_power_int64(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int64), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int64 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int64), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_int64 + + !> Overloads exponentiation operator for a tensor and a scalar of type `real32` + function torch_tensor_power_real32(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real32), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 
'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real32 + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real32), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_real32 + + !> Overloads exponentiation operator for a tensor and a scalar of type `real64` + function torch_tensor_power_real64(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real64), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real64 + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real64), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_real64 + + ! Torch Module API !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript) function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module) diff --git a/src/ftorch.fypp b/src/ftorch.fypp index c02cc38c..c3039ea1 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -112,6 +112,12 @@ module ftorch module procedure torch_tensor_divide end interface + interface operator (**) + #:for PREC in PRECISIONS + module procedure torch_tensor_power_${PREC}$ + #:endfor + end interface + contains !> Returns a tensor filled with the scalar value 0. @@ -392,6 +398,29 @@ contains output%p = torch_tensor_divide_c(tensor1%p, tensor2%p) end function torch_tensor_divide + #:for PREC in PRECISIONS + !> Overloads exponentiation operator for a tensor and a scalar of type `${PREC}$` + function torch_tensor_power_${PREC}$(tensor, power) result(output) + type(torch_tensor), intent(in) :: tensor + ${f_type(PREC)}$(kind=${PREC}$), intent(in) :: power + type(torch_tensor) :: output + + interface + function torch_tensor_power_c(tensor_c, power_c) result(output_c) & + bind(c, name = 'torch_tensor_power') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : ${PREC}$ + type(c_ptr), value, intent(in) :: tensor_c + ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: power_c + type(c_ptr) :: output_c + end function torch_tensor_power_c + end interface + + output%p = torch_tensor_power_c(tensor%p, power) + end function torch_tensor_power_${PREC}$ + + #:endfor + ! 
Torch Module API !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript) function torch_module_load(filename, device_type, device_index, requires_grad_opt, is_training_opt) result(module) From c6503426bf5c6b16b0660760412df3bb09cee803 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 15:13:40 +0100 Subject: [PATCH 08/43] Overload scalar premultiply and test --- examples/5_Autograd/autograd.f90 | 14 +++- src/ctorch.cpp | 11 +++ src/ctorch.h | 9 +++ src/ftorch.f90 | 126 +++++++++++++++++++++++++++++++ src/ftorch.fypp | 25 ++++++ 5 files changed, 181 insertions(+), 4 deletions(-) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 1ea9c769..0f5a06af 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -33,22 +33,28 @@ program example print *, "b =" call torch_tensor_print(b) ! --- - print *, "Q = a * b =" - Q = a * b + print *, "Q = a - b =" + Q = a - b call torch_tensor_print(Q) print *, "a =" call torch_tensor_print(a) print *, "b =" call torch_tensor_print(b) ! --- - print *, "Q = a - b =" - Q = a - b + print *, "Q = a * b =" + Q = a * b call torch_tensor_print(Q) print *, "a =" call torch_tensor_print(a) print *, "b =" call torch_tensor_print(b) ! --- + print *, "Q = 3 * a =" + Q = 3 * a + call torch_tensor_print(Q) + print *, "a =" + call torch_tensor_print(a) + ! --- print *, "Q = a / b =" Q = a / b call torch_tensor_print(Q) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 569b4070..5b0ce134 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -234,6 +234,17 @@ torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, return output; } +torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, + const torch_tensor_t tensor) + +{ + auto t = reinterpret_cast(tensor); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = scalar * *t; + return output; +} + torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, const torch_tensor_t tensor2) { diff --git a/src/ctorch.h b/src/ctorch.h index 46aec19b..c1ca8fef 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -144,6 +144,15 @@ EXPORT_C torch_tensor_t torch_tensor_subtract(const torch_tensor_t tensor1, EXPORT_C torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, const torch_tensor_t tensor2); +/** + * Overloads the premultiplication operator for a scalar and a Torch Tensor + * @param scalar to multiply by + * @param Tensor to be multiplied + * @return product of the scalar and Tensor + */ +EXPORT_C torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, + const torch_tensor_t tensor); + /** * Overloads the division operator for two Torch Tensors * @param first Tensor to be divided diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 66bd76dc..3c93a8fa 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -108,6 +108,12 @@ end function torch_from_blob_c interface operator (*) module procedure torch_tensor_multiply + module procedure torch_tensor_premultiply_int8 + module procedure torch_tensor_premultiply_int16 + module procedure torch_tensor_premultiply_int32 + module procedure torch_tensor_premultiply_int64 + module procedure torch_tensor_premultiply_real32 + module procedure torch_tensor_premultiply_real64 end interface interface operator (/) @@ -384,6 +390,126 @@ end function torch_tensor_multiply_c output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) end function torch_tensor_multiply + !> Overloads multiplication operator for a scalar of type int8 and a 
tensor. + function torch_tensor_premultiply_int8(scalar, tensor) result(output) + integer(kind=int8), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int8 + integer(kind=int8), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_int8 + + !> Overloads multiplication operator for a scalar of type int16 and a tensor. + function torch_tensor_premultiply_int16(scalar, tensor) result(output) + integer(kind=int16), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int16 + integer(kind=int16), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_int16 + + !> Overloads multiplication operator for a scalar of type int32 and a tensor. + function torch_tensor_premultiply_int32(scalar, tensor) result(output) + integer(kind=int32), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int32 + integer(kind=int32), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_int32 + + !> Overloads multiplication operator for a scalar of type int64 and a tensor. + function torch_tensor_premultiply_int64(scalar, tensor) result(output) + integer(kind=int64), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int64 + integer(kind=int64), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_int64 + + !> Overloads multiplication operator for a scalar of type real32 and a tensor. 
+ function torch_tensor_premultiply_real32(scalar, tensor) result(output) + real(kind=real32), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real32 + real(kind=real32), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_real32 + + !> Overloads multiplication operator for a scalar of type real64 and a tensor. + function torch_tensor_premultiply_real64(scalar, tensor) result(output) + real(kind=real64), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real64 + real(kind=real64), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_real64 + !> Overloads division operator for two tensors. function torch_tensor_divide(tensor1, tensor2) result(output) type(torch_tensor), intent(in) :: tensor1 diff --git a/src/ftorch.fypp b/src/ftorch.fypp index c3039ea1..2e042fde 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -106,6 +106,9 @@ module ftorch interface operator (*) module procedure torch_tensor_multiply + #:for PREC in PRECISIONS + module procedure torch_tensor_premultiply_${PREC}$ + #:endfor end interface interface operator (/) @@ -379,6 +382,28 @@ contains output%p = torch_tensor_multiply_c(tensor1%p, tensor2%p) end function torch_tensor_multiply + #:for PREC in PRECISIONS + !> Overloads multiplication operator for a scalar of type ${PREC}$ and a tensor. + function torch_tensor_premultiply_${PREC}$(scalar, tensor) result(output) + ${f_type(PREC)}$(kind=${PREC}$), intent(in) :: scalar + type(torch_tensor), intent(in) :: tensor + type(torch_tensor) :: output + + interface + function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & + bind(c, name = 'torch_tensor_premultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : ${PREC}$ + ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: scalar_c + type(c_ptr), value, intent(in) :: tensor_c + type(c_ptr) :: output_c + end function torch_tensor_premultiply_c + end interface + + output%p = torch_tensor_premultiply_c(scalar, tensor%p) + end function torch_tensor_premultiply_${PREC}$ + + #:endfor !> Overloads division operator for two tensors. 
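The premultiply overloads above, together with the earlier `**`, `-` and `=` overloads, are enough to write the compound expression used later in the Autograd example. A minimal usage sketch follows (illustration only, not part of the patch series); it assumes the ftorch module built from this branch and a compiler whose default integer kind is int32, so the bare integer literals select the `*_int32` specific procedures.

    program premultiply_sketch
      use, intrinsic :: iso_fortran_env, only : real32
      use ftorch
      implicit none
      real(kind=real32), dimension(2), target :: a_data, b_data
      integer :: tensor_layout(1) = [1]
      type(torch_tensor) :: a, b, Q

      a_data(:) = [2.0, 3.0]
      b_data(:) = [6.0, 4.0]
      call torch_tensor_from_array(a, a_data, tensor_layout, torch_kCPU)
      call torch_tensor_from_array(b, b_data, tensor_layout, torch_kCPU)

      ! ** binds tighter than *, which binds tighter than -, so this resolves
      ! to torch_tensor_power_int32, torch_tensor_premultiply_int32 and
      ! torch_tensor_subtract in turn.
      Q = 3 * a ** 3 - b ** 2
      call torch_tensor_print(Q)

      call torch_tensor_delete(a)
      call torch_tensor_delete(b)
      call torch_tensor_delete(Q)
    end program premultiply_sketch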
function torch_tensor_divide(tensor1, tensor2) result(output) type(torch_tensor), intent(in) :: tensor1 From c0c55073705669e66b30e1a0f114f4b26648c4fb Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 15:18:54 +0100 Subject: [PATCH 09/43] Overload scalar postmultiply and test --- examples/5_Autograd/autograd.f90 | 6 ++ run_integration_tests.sh | 11 +-- src/ctorch.cpp | 12 +++ src/ctorch.h | 9 +++ src/ftorch.f90 | 127 +++++++++++++++++++++++++++++++ src/ftorch.fypp | 24 ++++++ 6 files changed, 184 insertions(+), 5 deletions(-) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 0f5a06af..0b55bcb2 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -55,6 +55,12 @@ program example print *, "a =" call torch_tensor_print(a) ! --- + print *, "Q = b * 2 =" + Q = b * 2 + call torch_tensor_print(Q) + print *, "b =" + call torch_tensor_print(b) + ! --- print *, "Q = a / b =" Q = a / b call torch_tensor_print(Q) diff --git a/run_integration_tests.sh b/run_integration_tests.sh index cb1ed790..ec4e367b 100755 --- a/run_integration_tests.sh +++ b/run_integration_tests.sh @@ -12,11 +12,12 @@ set -eu CTEST_ARGS=$@ -EXAMPLES=" - 1_SimpleNet - 2_ResNet18 - 5_Autograd -" +# EXAMPLES=" +# 1_SimpleNet +# 2_ResNet18 +# 5_Autograd +# " +EXAMPLES="5_Autograd" BUILD_DIR=src/build for EXAMPLE in ${EXAMPLES} diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 5b0ce134..b62d788e 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -245,6 +245,18 @@ torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, return output; } +torch_tensor_t torch_tensor_postmultiply(const torch_tensor_t tensor, + const torch_data_t scalar) + + +{ + auto t = reinterpret_cast(tensor); + torch::Tensor* output = nullptr; + output = new torch::Tensor; + *output = *t * scalar; + return output; +} + torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, const torch_tensor_t tensor2) { diff --git a/src/ctorch.h b/src/ctorch.h index c1ca8fef..3d893482 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -153,6 +153,15 @@ EXPORT_C torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, EXPORT_C torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, const torch_tensor_t tensor); +/** + * Overloads the postmultiplication operator for a scalar and a Torch Tensor + * @param Tensor to be multiplied + * @param scalar to multiply by + * @return product of the Tensor and scalar + */ +EXPORT_C torch_tensor_t torch_tensor_postmultiply(const torch_tensor_t tensor, + const torch_data_t scalar); + /** * Overloads the division operator for two Torch Tensors * @param first Tensor to be divided diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 3c93a8fa..ee496093 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -109,11 +109,17 @@ end function torch_from_blob_c interface operator (*) module procedure torch_tensor_multiply module procedure torch_tensor_premultiply_int8 + module procedure torch_tensor_postmultiply_int8 module procedure torch_tensor_premultiply_int16 + module procedure torch_tensor_postmultiply_int16 module procedure torch_tensor_premultiply_int32 + module procedure torch_tensor_postmultiply_int32 module procedure torch_tensor_premultiply_int64 + module procedure torch_tensor_postmultiply_int64 module procedure torch_tensor_premultiply_real32 + module procedure torch_tensor_postmultiply_real32 module procedure torch_tensor_premultiply_real64 + module procedure torch_tensor_postmultiply_real64 end interface interface operator 
(/) @@ -510,6 +516,127 @@ end function torch_tensor_premultiply_c output%p = torch_tensor_premultiply_c(scalar, tensor%p) end function torch_tensor_premultiply_real64 + + !> Overloads multiplication operator for a tensor and a scalar of type int8. + function torch_tensor_postmultiply_int8(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int8), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int8 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int8), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_int8 + + !> Overloads multiplication operator for a tensor and a scalar of type int16. + function torch_tensor_postmultiply_int16(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int16), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int16 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int16), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_int16 + + !> Overloads multiplication operator for a tensor and a scalar of type int32. + function torch_tensor_postmultiply_int32(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int32), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int32 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int32), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_int32 + + !> Overloads multiplication operator for a tensor and a scalar of type int64. + function torch_tensor_postmultiply_int64(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int64), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int64 + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int64), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_int64 + + !> Overloads multiplication operator for a tensor and a scalar of type real32. 
+ function torch_tensor_postmultiply_real32(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real32), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real32 + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real32), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_real32 + + !> Overloads multiplication operator for a tensor and a scalar of type real64. + function torch_tensor_postmultiply_real64(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real64), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real64 + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real64), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_real64 + !> Overloads division operator for two tensors. function torch_tensor_divide(tensor1, tensor2) result(output) type(torch_tensor), intent(in) :: tensor1 diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 2e042fde..f5e0f1c9 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -108,6 +108,7 @@ module ftorch module procedure torch_tensor_multiply #:for PREC in PRECISIONS module procedure torch_tensor_premultiply_${PREC}$ + module procedure torch_tensor_postmultiply_${PREC}$ #:endfor end interface @@ -403,6 +404,29 @@ contains output%p = torch_tensor_premultiply_c(scalar, tensor%p) end function torch_tensor_premultiply_${PREC}$ + #:endfor + + #:for PREC in PRECISIONS + !> Overloads multiplication operator for a tensor and a scalar of type ${PREC}$. + function torch_tensor_postmultiply_${PREC}$(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + ${f_type(PREC)}$(kind=${PREC}$), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postmultiply_c(tensor_c, scalar_c) result(output_c) & + bind(c, name = 'torch_tensor_postmultiply') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : ${PREC}$ + type(c_ptr), value, intent(in) :: tensor_c + ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postmultiply_c + end interface + + output%p = torch_tensor_postmultiply_c(tensor%p, scalar) + end function torch_tensor_postmultiply_${PREC}$ + #:endfor !> Overloads division operator for two tensors. 
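With the postmultiply specifics in place, a scalar may now appear on either side of `*`. The sketch below is an illustration only, not part of the patch series; it assumes the ftorch module built from this branch. The kind of the scalar picks the specific procedure (a bare `2` resolves to the int32 variants where default integer is int32), mirroring how the fypp template generates one variant per entry in PRECISIONS.

    program scalar_multiply_sketch
      use, intrinsic :: iso_fortran_env, only : real32, real64
      use ftorch
      implicit none
      real(kind=real32), dimension(3), target :: x_data
      integer :: tensor_layout(1) = [1]
      type(torch_tensor) :: x, y

      x_data(:) = [1.0, 2.0, 3.0]
      call torch_tensor_from_array(x, x_data, tensor_layout, torch_kCPU)

      y = 2 * x           ! premultiply: torch_tensor_premultiply_int32
      y = x * 2           ! postmultiply: torch_tensor_postmultiply_int32
      y = 2.0_real64 * x  ! the scalar kind selects the real64 specific procedure

      call torch_tensor_print(y)
      call torch_tensor_delete(x)
      call torch_tensor_delete(y)
    end program scalar_multiply_sketch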
function torch_tensor_divide(tensor1, tensor2) result(output) From 6aa37680f8810bc8ac14ed0ed5deef4488c6135a Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 15:38:18 +0100 Subject: [PATCH 10/43] Tidy overloading test - FIXME --- examples/5_Autograd/CMakeLists.txt | 6 +- examples/5_Autograd/autograd.f90 | 105 ++++++++++------------------- examples/5_Autograd/autograd.py | 1 + run_integration_tests.sh | 11 ++- 4 files changed, 48 insertions(+), 75 deletions(-) diff --git a/examples/5_Autograd/CMakeLists.txt b/examples/5_Autograd/CMakeLists.txt index 41cd700b..1f16b9ec 100644 --- a/examples/5_Autograd/CMakeLists.txt +++ b/examples/5_Autograd/CMakeLists.txt @@ -25,9 +25,13 @@ if(CMAKE_BUILD_TESTS) # 1. Check the Python Autograd script runs successfully add_test(NAME pyautograd COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/autograd.py) + set_tests_properties(pyautograd PROPERTIES PASS_REGULAR_EXPRESSION + "-12., 65.") - # 2. Check the Python Autograd script runs successfully + # 2. Check the Fortran Autograd script runs successfully add_test(NAME fautograd COMMAND autograd WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + set_tests_properties(fautograd PROPERTIES PASS_REGULAR_EXPRESSION + "-12.00000000 65.00000000") endif() diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 0b55bcb2..12c4f323 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -1,81 +1,50 @@ program example + + ! Import precision info from iso + use, intrinsic :: iso_fortran_env, only : sp => real32 + + ! Import our library for interfacing with PyTorch's Autograd module use ftorch + implicit none + ! Set working precision for reals + integer, parameter :: wp = sp + + ! Set up Fortran data structures + real(wp), dimension(2), target :: in_data1 + real(wp), dimension(2), target :: in_data2 + real(wp), dimension(2), target :: out_data integer :: tensor_layout(1) = [1] + + ! Set up Torch data structures type(torch_tensor) :: a, b, Q - call torch_tensor_from_array(a, [2.0, 3.0], tensor_layout, torch_kCPU, & - device_index=0) ! FIXME: requires_grad=.true. - call torch_tensor_from_array(b, [6.0, 4.0], tensor_layout, torch_kCPU, & - device_index=0) ! FIXME: requires_grad=.true. + ! Initialise data + in_data1(:) = [2.0, 3.0] + in_data2(:) = [6.0, 4.0] + + ! FIXME: requires_grad=.true. + call torch_tensor_from_array(a, in_data1, tensor_layout, torch_kCPU) + call torch_tensor_from_array(b, in_data2, tensor_layout, torch_kCPU) + call torch_tensor_from_array(Q, out_data, tensor_layout, torch_kCPU) - ! TODO: Q = 3 * a ** 3 - 2 * b - ! Requires overloading elementary operations - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - Q = a - print *, "Q = a =" - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = a + b =" - Q = a + b - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = a - b =" - Q = a - b - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = a * b =" - Q = a * b - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = 3 * a =" - Q = 3 * a - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - ! 
--- - print *, "Q = b * 2 =" - Q = b * 2 - call torch_tensor_print(Q) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = a / b =" - Q = a / b - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) - print *, "b =" - call torch_tensor_print(b) - ! --- - print *, "Q = a ** 2 =" - Q = a ** 2 - call torch_tensor_print(Q) - print *, "a =" - call torch_tensor_print(a) + ! Check arithmetic operations work for torch_tensors + write (*,*) "a = ", in_data1(:) + write (*,*) "b = ", in_data2(:) + Q = 3 * a ** 3 - b ** 2 + write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:) + + ! Check a and b are unchanged by the arithmetic operations + write (*,*) "a = ", in_data1(:) + write (*,*) "b = ", in_data2(:) ! TODO: Backward ! Requires API extension + ! Cleanup + call torch_tensor_delete(a) + call torch_tensor_delete(b) + call torch_tensor_delete(Q) + end program example diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py index 3b663c17..b4975e45 100755 --- a/examples/5_Autograd/autograd.py +++ b/examples/5_Autograd/autograd.py @@ -9,6 +9,7 @@ b = torch.tensor([6.0, 4.0], requires_grad=True) Q = 3 * a**3 - b**2 +print(Q) external_grad = torch.tensor([1.0, 1.0]) Q.backward(gradient=external_grad) diff --git a/run_integration_tests.sh b/run_integration_tests.sh index ec4e367b..cb1ed790 100755 --- a/run_integration_tests.sh +++ b/run_integration_tests.sh @@ -12,12 +12,11 @@ set -eu CTEST_ARGS=$@ -# EXAMPLES=" -# 1_SimpleNet -# 2_ResNet18 -# 5_Autograd -# " -EXAMPLES="5_Autograd" +EXAMPLES=" + 1_SimpleNet + 2_ResNet18 + 5_Autograd +" BUILD_DIR=src/build for EXAMPLE in ${EXAMPLES} From 7b9126e565319042e10d6f1dbcc0319edf9c4741 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 16:19:16 +0100 Subject: [PATCH 11/43] Formatting --- examples/5_Autograd/autograd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py index b4975e45..4c12fe1f 100755 --- a/examples/5_Autograd/autograd.py +++ b/examples/5_Autograd/autograd.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Autograd demo taken from -https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html +Autograd demo taken from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html. """ + import torch a = torch.tensor([2.0, 3.0], requires_grad=True) From e6c7675c2d5092a7322bd8b079212e563000e1b9 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 21 Jun 2024 16:20:19 +0100 Subject: [PATCH 12/43] Reformatting --- examples/5_Autograd/autograd.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py index 4c12fe1f..1c57c19b 100755 --- a/examples/5_Autograd/autograd.py +++ b/examples/5_Autograd/autograd.py @@ -1,7 +1,5 @@ #!/usr/bin/env python3 -""" -Autograd demo taken from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html. 
-""" +"""Autograd demo taken from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html.""" import torch From 528b4f9ad41c20e760529e61160daaa5d926c85b Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Thu, 27 Jun 2024 12:38:02 +0100 Subject: [PATCH 13/43] Fix merge issue --- src/ftorch.f90 | 48 ++++++++++++++++++++++++------------------------ src/ftorch.fypp | 2 +- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/ftorch.f90 b/src/ftorch.f90 index cf9b302d..0bff7a48 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -908,7 +908,7 @@ end subroutine torch_model_delete !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8` subroutine torch_tensor_from_array_int8_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -962,7 +962,7 @@ end subroutine torch_tensor_from_array_int8_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8` subroutine torch_tensor_from_array_int8_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1016,7 +1016,7 @@ end subroutine torch_tensor_from_array_int8_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8` subroutine torch_tensor_from_array_int8_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1070,7 +1070,7 @@ end subroutine torch_tensor_from_array_int8_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8` subroutine torch_tensor_from_array_int8_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1124,7 +1124,7 @@ end subroutine torch_tensor_from_array_int8_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16` subroutine torch_tensor_from_array_int16_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1178,7 +1178,7 @@ end subroutine torch_tensor_from_array_int16_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16` subroutine torch_tensor_from_array_int16_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1232,7 +1232,7 @@ end subroutine torch_tensor_from_array_int16_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16` subroutine torch_tensor_from_array_int16_3d(tensor, data_in, layout, & - c_device_type, device_index, 
requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1286,7 +1286,7 @@ end subroutine torch_tensor_from_array_int16_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16` subroutine torch_tensor_from_array_int16_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1340,7 +1340,7 @@ end subroutine torch_tensor_from_array_int16_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32` subroutine torch_tensor_from_array_int32_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1394,7 +1394,7 @@ end subroutine torch_tensor_from_array_int32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32` subroutine torch_tensor_from_array_int32_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1448,7 +1448,7 @@ end subroutine torch_tensor_from_array_int32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32` subroutine torch_tensor_from_array_int32_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1502,7 +1502,7 @@ end subroutine torch_tensor_from_array_int32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32` subroutine torch_tensor_from_array_int32_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1556,7 +1556,7 @@ end subroutine torch_tensor_from_array_int32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64` subroutine torch_tensor_from_array_int64_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1610,7 +1610,7 @@ end subroutine torch_tensor_from_array_int64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64` subroutine torch_tensor_from_array_int64_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1664,7 +1664,7 @@ end subroutine torch_tensor_from_array_int64_2d !> Return a Torch tensor pointing to data_in array of rank 3 
containing data of type `int64` subroutine torch_tensor_from_array_int64_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1718,7 +1718,7 @@ end subroutine torch_tensor_from_array_int64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64` subroutine torch_tensor_from_array_int64_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1772,7 +1772,7 @@ end subroutine torch_tensor_from_array_int64_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32` subroutine torch_tensor_from_array_real32_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1826,7 +1826,7 @@ end subroutine torch_tensor_from_array_real32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32` subroutine torch_tensor_from_array_real32_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1880,7 +1880,7 @@ end subroutine torch_tensor_from_array_real32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32` subroutine torch_tensor_from_array_real32_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1934,7 +1934,7 @@ end subroutine torch_tensor_from_array_real32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32` subroutine torch_tensor_from_array_real32_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1988,7 +1988,7 @@ end subroutine torch_tensor_from_array_real32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64` subroutine torch_tensor_from_array_real64_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 @@ -2042,7 +2042,7 @@ end subroutine torch_tensor_from_array_real64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64` subroutine torch_tensor_from_array_real64_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: 
iso_fortran_env, only : real64 @@ -2096,7 +2096,7 @@ end subroutine torch_tensor_from_array_real64_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64` subroutine torch_tensor_from_array_real64_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 @@ -2150,7 +2150,7 @@ end subroutine torch_tensor_from_array_real64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64` subroutine torch_tensor_from_array_real64_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 6c6ef667..12b7b4ba 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -603,7 +603,7 @@ contains #:for RANK in RANKS !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$` subroutine torch_tensor_from_array_${PREC}$_${RANK}$d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad_opt) + device_type, device_index, requires_grad_opt) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : ${PREC}$ From dfaba4d60f2132ce9ae10a6a7b02d8a5af90fe03 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Thu, 27 Jun 2024 12:38:19 +0100 Subject: [PATCH 14/43] Include example 3 for consistency --- examples/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a993299d..f6e38bbe 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,7 +1,9 @@ if(CMAKE_BUILD_TESTS) add_subdirectory(1_SimpleNet) add_subdirectory(2_ResNet18) - # add_subdirectory(3_MultiGPU) + if(ENABLE_CUDA) + add_subdirectory(3_MultiGPU) + endif() add_subdirectory(4_MultiIO) add_subdirectory(5_Autograd) endif() From 202f77f76fc09bdfd4a68a1e020ff294c053956e Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Thu, 27 Jun 2024 12:38:30 +0100 Subject: [PATCH 15/43] DO NOT MERGE (debugging) --- examples/5_Autograd/autograd.f90 | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 12c4f323..e075c977 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -33,6 +33,7 @@ program example write (*,*) "a = ", in_data1(:) write (*,*) "b = ", in_data2(:) Q = 3 * a ** 3 - b ** 2 + call torch_tensor_print(Q) ! TODO: Temporary (debug) write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:) ! Check a and b are unchanged by the arithmetic operations From 1eb6d526caf45419777b4c60268891283e27b769 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 15 Jul 2024 11:59:41 +0100 Subject: [PATCH 16/43] Implement torch_to_blob on C++ side --- src/ctorch.cpp | 37 +++++++++++++++++++++++++++++++++++++ src/ctorch.h | 9 +++++++++ 2 files changed, 46 insertions(+) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 359c733d..fd99895e 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -173,6 +173,43 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape, return tensor; } +// FIXME: Why can't we use torch::kUInt8, etc.? 
+void* torch_to_blob(const torch_tensor_t tensor, const torch_data_t dtype) +{ + auto t = reinterpret_cast(tensor); + void* raw_ptr; + switch (dtype) { + case torch_kUInt8: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kInt8: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kInt16: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kInt32: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kInt64: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kFloat16: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kFloat32: + raw_ptr = (void*) t->data_ptr(); + break; + case torch_kFloat64: + raw_ptr = (void*) t->data_ptr(); + break; + default: + std::cerr << "[WARNING]: unknown data type" << std::endl; + exit(EXIT_FAILURE); + } + return raw_ptr; +} + void torch_tensor_print(const torch_tensor_t tensor) { auto t = reinterpret_cast(tensor); diff --git a/src/ctorch.h b/src/ctorch.h index 39117f72..32e84188 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -91,6 +91,15 @@ EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim, int device_index, const bool requires_grad); +/** + * Function to extract a C-array from a Torch Tensor's data. + * + * @param the Torch Tensor + * @param data type of the elements of the Tensor + * @return pointer to the Tensor in memory + */ +EXPORT_C void* torch_to_blob(const torch_tensor_t tensor, const torch_data_t dtype); + /** * Function to print out a Torch Tensor * @param Torch Tensor to print From 60ce4a6156ea3c0be3667f8e0a8b1840d4f14702 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 22 Jul 2024 17:38:13 +0100 Subject: [PATCH 17/43] Implement Fortran interface --- src/ftorch.f90 | 449 +++++++++++++++++++++++++++++++++++++++++++++++- src/ftorch.fypp | 43 ++++- 2 files changed, 490 insertions(+), 2 deletions(-) diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 0009ed0c..18620076 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -9,7 +9,7 @@ module ftorch use, intrinsic :: iso_c_binding, only: c_int, c_int8_t, c_int16_t, c_int32_t, c_int64_t, c_int64_t, & - c_float, c_double, c_char, c_ptr, c_null_ptr + c_float, c_double, c_char, c_ptr, c_null_ptr, c_f_pointer use, intrinsic :: iso_fortran_env, only: int8, int16, int32, int64, real32, real64 implicit none @@ -74,6 +74,34 @@ module ftorch module procedure torch_tensor_from_array_real64_4d end interface + !> Interface for directing `torch_tensor_to_array` to possible input types and ranks + interface torch_tensor_to_array + module procedure torch_tensor_to_array_int8_1d + module procedure torch_tensor_to_array_int8_2d + module procedure torch_tensor_to_array_int8_3d + module procedure torch_tensor_to_array_int8_4d + module procedure torch_tensor_to_array_int16_1d + module procedure torch_tensor_to_array_int16_2d + module procedure torch_tensor_to_array_int16_3d + module procedure torch_tensor_to_array_int16_4d + module procedure torch_tensor_to_array_int32_1d + module procedure torch_tensor_to_array_int32_2d + module procedure torch_tensor_to_array_int32_3d + module procedure torch_tensor_to_array_int32_4d + module procedure torch_tensor_to_array_int64_1d + module procedure torch_tensor_to_array_int64_2d + module procedure torch_tensor_to_array_int64_3d + module procedure torch_tensor_to_array_int64_4d + module procedure torch_tensor_to_array_real32_1d + module procedure torch_tensor_to_array_real32_2d + module procedure torch_tensor_to_array_real32_3d + module procedure torch_tensor_to_array_real32_4d + module procedure torch_tensor_to_array_real64_1d + module 
procedure torch_tensor_to_array_real64_2d + module procedure torch_tensor_to_array_real64_3d + module procedure torch_tensor_to_array_real64_4d + end interface + !> Interface for deleting generic torch objects interface torch_delete module procedure torch_model_delete @@ -101,6 +129,16 @@ function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, & end function torch_from_blob_c end interface + interface + function torch_to_blob_c(tensor, dtype) result(data) & + bind(c, name = 'torch_to_blob') + use, intrinsic :: iso_c_binding, only : c_int, c_ptr + type(c_ptr), value, intent(in) :: tensor + integer(c_int), value, intent(in) :: dtype + type(c_ptr) :: data + end function torch_to_blob_c + end interface + contains !> Returns a tensor filled with the scalar value 0. @@ -1729,4 +1767,413 @@ subroutine torch_tensor_from_array_real64_4d(tensor, data_in, layout, & end subroutine torch_tensor_from_array_real64_4d + !> Return the array data associated with a Torch tensor of rank 1 and data type `int8` + subroutine torch_tensor_to_array_int8_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int8 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int8), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int8_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `int8` + subroutine torch_tensor_to_array_int8_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int8 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int8), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int8_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `int8` + subroutine torch_tensor_to_array_int8_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int8 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int8), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int8_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `int8` + subroutine torch_tensor_to_array_int8_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int8 + type(torch_tensor), intent(in) :: tensor !! 
Returned tensor + integer(kind=int8), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int8_4d + + !> Return the array data associated with a Torch tensor of rank 1 and data type `int16` + subroutine torch_tensor_to_array_int16_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int16 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int16), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int16_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `int16` + subroutine torch_tensor_to_array_int16_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int16 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int16), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int16_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `int16` + subroutine torch_tensor_to_array_int16_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int16 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int16), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int16_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `int16` + subroutine torch_tensor_to_array_int16_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int16 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int16), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int16_4d + + !> Return the array data associated with a Torch tensor of rank 1 and data type `int32` + subroutine torch_tensor_to_array_int32_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int32), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int32_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `int32` + subroutine torch_tensor_to_array_int32_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int32), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int32_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `int32` + subroutine torch_tensor_to_array_int32_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int32), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int32_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `int32` + subroutine torch_tensor_to_array_int32_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int32), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int32_4d + + !> Return the array data associated with a Torch tensor of rank 1 and data type `int64` + subroutine torch_tensor_to_array_int64_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int64), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int64_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `int64` + subroutine torch_tensor_to_array_int64_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int64), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int64_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `int64` + subroutine torch_tensor_to_array_int64_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int64), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int64_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `int64` + subroutine torch_tensor_to_array_int64_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : int64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + integer(kind=int64), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_int64_4d + + !> Return the array data associated with a Torch tensor of rank 1 and data type `real32` + subroutine torch_tensor_to_array_real32_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real32), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real32_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `real32` + subroutine torch_tensor_to_array_real32_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real32), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real32_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `real32` + subroutine torch_tensor_to_array_real32_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real32), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real32_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `real32` + subroutine torch_tensor_to_array_real32_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real32 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real32), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real32_4d + + !> Return the array data associated with a Torch tensor of rank 1 and data type `real64` + subroutine torch_tensor_to_array_real64_1d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real64), pointer, intent(out) :: data_out(:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real64_1d + + !> Return the array data associated with a Torch tensor of rank 2 and data type `real64` + subroutine torch_tensor_to_array_real64_2d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real64), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real64_2d + + !> Return the array data associated with a Torch tensor of rank 3 and data type `real64` + subroutine torch_tensor_to_array_real64_3d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real64), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real64_3d + + !> Return the array data associated with a Torch tensor of rank 4 and data type `real64` + subroutine torch_tensor_to_array_real64_4d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : real64 + type(torch_tensor), intent(in) :: tensor !! Returned tensor + real(kind=real64), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_real64_4d + + end module ftorch diff --git a/src/ftorch.fypp b/src/ftorch.fypp index b90b7f73..fd03fd35 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -26,7 +26,7 @@ $:'integer' if PRECISION[:3] == 'int' else 'real' module ftorch use, intrinsic :: iso_c_binding, only: c_int, c_int8_t, c_int16_t, c_int32_t, c_int64_t, c_int64_t, & - c_float, c_double, c_char, c_ptr, c_null_ptr + c_float, c_double, c_char, c_ptr, c_null_ptr, c_f_pointer use, intrinsic :: iso_fortran_env, only: int8, int16, int32, int64, real32, real64 implicit none @@ -72,6 +72,15 @@ module ftorch #:endfor end interface + !> Interface for directing `torch_tensor_to_array` to possible input types and ranks + interface torch_tensor_to_array + #:for PREC in PRECISIONS + #:for RANK in RANKS + module procedure torch_tensor_to_array_${PREC}$_${RANK}$d + #:endfor + #:endfor + end interface + !> Interface for deleting generic torch objects interface torch_delete module procedure torch_model_delete @@ -99,6 +108,16 @@ module ftorch end function torch_from_blob_c end interface + interface + function torch_to_blob_c(tensor, dtype) result(data) & + bind(c, name = 'torch_to_blob') + use, intrinsic :: iso_c_binding, only : c_int, c_ptr + type(c_ptr), value, intent(in) :: tensor + integer(c_int), value, intent(in) :: dtype + type(c_ptr) :: data + end function torch_to_blob_c + end interface + contains !> Returns a tensor filled with the scalar value 0. @@ -489,4 +508,26 @@ contains #:endfor #:endfor + #:for PREC in PRECISIONS + #:for RANK in RANKS + !> Return the array data associated with a Torch tensor of rank ${RANK}$ and data type `${PREC}$` + subroutine torch_tensor_to_array_${PREC}$_${RANK}$d(tensor, data_out) + use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc + use, intrinsic :: iso_fortran_env, only : ${PREC}$ + type(torch_tensor), intent(in) :: tensor !! Returned tensor + ${f_type(PREC)}$(kind=${PREC}$), pointer, intent(out) :: data_out${ranksuffix(RANK)}$ !! Pointer to tensor data + integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type + + ! Local data + integer(c_int64_t) :: c_tensor_shape(${RANK}$) !! Shape of the tensor + type(c_ptr) :: cptr + + cptr = torch_to_blob_c(tensor%p, c_dtype) + c_tensor_shape = shape(data_out) + call c_f_pointer(cptr, data_out, c_tensor_shape) + end subroutine torch_tensor_to_array_${PREC}$_${RANK}$d + + #:endfor + #:endfor + end module ftorch From 684a71129f8d3c964e629742c7f62d0c4ac8afa2 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 22 Jul 2024 17:48:20 +0100 Subject: [PATCH 18/43] Use torch_tensor_to_array in example 1 --- examples/1_SimpleNet/simplenet_infer_fortran.f90 | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90 index aacf06b7..0835e3a9 100644 --- a/examples/1_SimpleNet/simplenet_infer_fortran.f90 +++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90 @@ -16,7 +16,8 @@ program inference ! Set up Fortran data structures real(wp), dimension(5), target :: in_data - real(wp), dimension(5), target :: out_data + real(wp), dimension(5), target :: dummy_data + real(wp), dimension(:), pointer :: out_data integer :: tensor_layout(1) = [1] ! 
Set up Torch data structures @@ -35,18 +36,26 @@ program inference ! Initialise data in_data = [0.0, 1.0, 2.0, 3.0, 4.0] - ! Create Torch input/output tensors from the above arrays + ! Create Torch input tensor from the above array call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, torch_kCPU) - call torch_tensor_from_array(out_tensors(1), out_data, tensor_layout, torch_kCPU) + + ! Create an empty Torch output tensor + ! TODO: Drop dummy_data and initialise out_tensors with torch_tensor_empty + call torch_tensor_from_array(out_tensors(1), dummy_data, tensor_layout, torch_kCPU) ! Load ML model call torch_model_load(model, args(1)) ! Infer call torch_model_forward(model, in_tensors, out_tensors) + + ! Extract the output as a Fortran array + allocate(out_data(5)) + call torch_tensor_to_array(out_tensors(1), out_data) write (*,*) out_data(:) ! Cleanup + nullify(out_data) call torch_delete(model) call torch_delete(in_tensors) call torch_delete(out_tensors) From d35218eff086e65bb7e2c7a5b2ca8d6b880772a2 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 11:38:12 +0100 Subject: [PATCH 19/43] Use correct data types; raise errors for unsupported cases --- src/ctorch.cpp | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index fd99895e..252c3f19 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -7,7 +7,8 @@ constexpr auto get_dtype(torch_data_t dtype) { switch (dtype) { case torch_kUInt8: - return torch::kUInt8; + std::cerr << "[WARNING]: uint8 not supported" << std::endl; + exit(EXIT_FAILURE); case torch_kInt8: return torch::kInt8; case torch_kInt16: @@ -17,7 +18,8 @@ constexpr auto get_dtype(torch_data_t dtype) case torch_kInt64: return torch::kInt64; case torch_kFloat16: - return torch::kFloat16; + std::cerr << "[WARNING]: float16 not supported" << std::endl; + exit(EXIT_FAILURE); case torch_kFloat32: return torch::kFloat32; case torch_kFloat64: @@ -173,35 +175,34 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape, return tensor; } -// FIXME: Why can't we use torch::kUInt8, etc.? 
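[Editorial note] The preceding change to `get_dtype` (and the matching change to `torch_to_blob` just below) rejects `uint8` and `float16` because `iso_fortran_env` offers no corresponding kinds, so no Fortran interface can be generated for them. As a hedged illustration of the pairings that are supported, the sketch below (invented program and variable names) creates tensors from `int32` and `real64` arrays; analogous interfaces exist for `int8`, `int16`, `int64` and `real32`.

```fortran
program dtype_pairs
  use, intrinsic :: iso_fortran_env, only : int32, real64
  use ftorch
  implicit none

  integer(int32), dimension(4), target :: idata = [1_int32, 2_int32, 3_int32, 4_int32]
  real(real64), dimension(4), target :: rdata = [1.0_real64, 2.0_real64, 3.0_real64, 4.0_real64]
  integer :: layout(1) = [1]
  type(torch_tensor) :: ti, tr

  ! int32 data pairs with torch_kInt32, real64 with torch_kFloat64, and so on;
  ! there is deliberately no interface for unsigned 8-bit or half-precision data.
  call torch_tensor_from_array(ti, idata, layout, torch_kCPU)
  call torch_tensor_from_array(tr, rdata, layout, torch_kCPU)

  call torch_tensor_delete(ti)
  call torch_tensor_delete(tr)
end program dtype_pairs
```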
void* torch_to_blob(const torch_tensor_t tensor, const torch_data_t dtype) { auto t = reinterpret_cast(tensor); void* raw_ptr; switch (dtype) { case torch_kUInt8: - raw_ptr = (void*) t->data_ptr(); - break; + std::cerr << "[WARNING]: uint8 not supported" << std::endl; + exit(EXIT_FAILURE); case torch_kInt8: - raw_ptr = (void*) t->data_ptr(); + raw_ptr = (void*) t->data_ptr(); break; case torch_kInt16: - raw_ptr = (void*) t->data_ptr(); + raw_ptr = (void*) t->data_ptr(); break; case torch_kInt32: - raw_ptr = (void*) t->data_ptr(); + raw_ptr = (void*) t->data_ptr(); break; case torch_kInt64: - raw_ptr = (void*) t->data_ptr(); + raw_ptr = (void*) t->data_ptr(); break; case torch_kFloat16: - raw_ptr = (void*) t->data_ptr(); - break; + std::cerr << "[WARNING]: float16 not supported" << std::endl; + exit(EXIT_FAILURE); case torch_kFloat32: raw_ptr = (void*) t->data_ptr(); break; case torch_kFloat64: - raw_ptr = (void*) t->data_ptr(); + raw_ptr = (void*) t->data_ptr(); break; default: std::cerr << "[WARNING]: unknown data type" << std::endl; From b5f53f2069371beb6c5b7ddaec2edb4ff8bc9dbd Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 11:44:01 +0100 Subject: [PATCH 20/43] Revert changes to example 1 --- examples/1_SimpleNet/simplenet_infer_fortran.f90 | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90 index 0835e3a9..aacf06b7 100644 --- a/examples/1_SimpleNet/simplenet_infer_fortran.f90 +++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90 @@ -16,8 +16,7 @@ program inference ! Set up Fortran data structures real(wp), dimension(5), target :: in_data - real(wp), dimension(5), target :: dummy_data - real(wp), dimension(:), pointer :: out_data + real(wp), dimension(5), target :: out_data integer :: tensor_layout(1) = [1] ! Set up Torch data structures @@ -36,26 +35,18 @@ program inference ! Initialise data in_data = [0.0, 1.0, 2.0, 3.0, 4.0] - ! Create Torch input tensor from the above array + ! Create Torch input/output tensors from the above arrays call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, torch_kCPU) - - ! Create an empty Torch output tensor - ! TODO: Drop dummy_data and initialise out_tensors with torch_tensor_empty - call torch_tensor_from_array(out_tensors(1), dummy_data, tensor_layout, torch_kCPU) + call torch_tensor_from_array(out_tensors(1), out_data, tensor_layout, torch_kCPU) ! Load ML model call torch_model_load(model, args(1)) ! Infer call torch_model_forward(model, in_tensors, out_tensors) - - ! Extract the output as a Fortran array - allocate(out_data(5)) - call torch_tensor_to_array(out_tensors(1), out_data) write (*,*) out_data(:) ! 
Cleanup - nullify(out_data) call torch_delete(model) call torch_delete(in_tensors) call torch_delete(out_tensors) From cf9a00693eefbbbe973d72cea9608f7b0f2855b6 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 12:01:51 +0100 Subject: [PATCH 21/43] Add beginnings of autograd demo --- examples/5_Autograd/CMakeLists.txt | 37 +++++++++++++++ examples/5_Autograd/README.md | 68 ++++++++++++++++++++++++++++ examples/5_Autograd/autograd.f90 | 35 ++++++++++++++ examples/5_Autograd/autograd.py | 16 +++++++ examples/5_Autograd/requirements.txt | 0 examples/CMakeLists.txt | 1 + run_integration_tests.sh | 2 +- src/CMakeLists.txt | 3 ++ 8 files changed, 161 insertions(+), 1 deletion(-) create mode 100644 examples/5_Autograd/CMakeLists.txt create mode 100644 examples/5_Autograd/README.md create mode 100644 examples/5_Autograd/autograd.f90 create mode 100755 examples/5_Autograd/autograd.py create mode 100644 examples/5_Autograd/requirements.txt diff --git a/examples/5_Autograd/CMakeLists.txt b/examples/5_Autograd/CMakeLists.txt new file mode 100644 index 00000000..baeb239f --- /dev/null +++ b/examples/5_Autograd/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) +#policy CMP0076 - target_sources source files are relative to file where target_sources is run +cmake_policy (SET CMP0076 NEW) + +set(PROJECT_NAME AutogradExample) + +project(${PROJECT_NAME} LANGUAGES Fortran) + +# Build in Debug mode if not specified +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug CACHE STRING "" FORCE) +endif() + +find_package(FTorch) +message(STATUS "Building with Fortran PyTorch coupling") + +# Fortran example +add_executable(autograd autograd.f90) +target_link_libraries(autograd PRIVATE FTorch::ftorch) + +# Integration testing +if(CMAKE_BUILD_TESTS) + include(CTest) + + # 1. Check the Python Autograd script runs successfully + add_test(NAME pyautograd + COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/autograd.py) + set_tests_properties(pyautograd PROPERTIES PASS_REGULAR_EXPRESSION + "-12., 65.") + + # 2. Check the Fortran Autograd script runs successfully + add_test(NAME fautograd + COMMAND autograd + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) + set_tests_properties(fautograd PROPERTIES PASS_REGULAR_EXPRESSION + "2.00000000 3.00000000") +endif() diff --git a/examples/5_Autograd/README.md b/examples/5_Autograd/README.md new file mode 100644 index 00000000..884db584 --- /dev/null +++ b/examples/5_Autograd/README.md @@ -0,0 +1,68 @@ +# Example 5 - Autograd + +This example will demonstrate automatic differentation in FTorch by leveraging +PyTorch's Autograd module. + +By exposing Autograd in Fortran, FTorch will be able to compute derivatives of +expressions involving `torch_tensor`s. + +## Description + +A Python demo is copied from the PyTorch documentation as `autograd.py`, which +shows how to compute the gradient of an arithmetic combination of Torch Tensors. + +The demo will be replicated in Fortran as `autograd.f90`, to show how to do the +same thing using FTorch. This demo is currently unfinished and simply +demonstrates how to use the `torch_tensor_to_array` subroutine to extract a +Fortran array from a `torch_tensor`. + +## Dependencies + +To run this example requires: + +- CMake +- Fortran compiler +- FTorch (installed as described in main package) +- Python 3 + +## Running + +To run this example install FTorch as described in the main documentation. 
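In outline, the Fortran side of the demo just round-trips a small array through a
`torch_tensor` (condensed from `autograd.f90` in this example; the program name is
changed here so it does not clash with the real demo):
```fortran
program autograd_outline
  use, intrinsic :: iso_fortran_env, only : sp => real32
  use ftorch
  implicit none

  real(sp), dimension(2), target :: in_data
  real(sp), dimension(:), pointer :: out_data
  integer :: tensor_layout(1) = [1]
  type(torch_tensor) :: a

  ! Construct a Torch tensor from a Fortran array ...
  in_data(:) = [2.0_sp, 3.0_sp]
  call torch_tensor_from_array(a, in_data, tensor_layout, torch_kCPU)

  ! ... and view its data as a Fortran array again
  allocate(out_data(2))
  call torch_tensor_to_array(a, out_data)
  write (*,*) "a = ", out_data(:)

  nullify(out_data)
  call torch_tensor_delete(a)
end program autograd_outline
```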
+Then from this directory create a virtual environment and install the necessary +Python modules: +``` +python3 -m venv venv +source venv/bin/activate +pip install -r requirements.txt +``` + +Run the Python version of the demo with +``` +python3 autograd.py +``` +This performs some arithmetic on two input tensors [2.0, 3.0] and [6.0, 4.0] to +produce the result: +``` +tensor([-12., 65.], grad_fn=) +``` +where `` refers to the method used for computing the gradient. + + +To run the Fortran version of the demo we need to compile with (for example) +``` +mkdir build +cd build +cmake .. -DCMAKE_PREFIX_PATH= -DCMAKE_BUILD_TYPE=Release +cmake --build . +``` + +To run the compiled code, simply use +``` +./autograd +``` +Currently, the example simply constructs a Torch Tensor from the array and then +extracts it again. As such, running the above should print the same values as +the input: +``` + 2.00000000 3.00000000 +``` diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 new file mode 100644 index 00000000..16f77162 --- /dev/null +++ b/examples/5_Autograd/autograd.f90 @@ -0,0 +1,35 @@ +program example + + ! Import precision info from iso + use, intrinsic :: iso_fortran_env, only : sp => real32 + + ! Import our library for interfacing with PyTorch's Autograd module + use ftorch + + implicit none + + ! Set working precision for reals + integer, parameter :: wp = sp + + ! Set up Fortran data structures + real(wp), dimension(2), target :: in_data + real(wp), dimension(:), pointer :: out_data + integer :: tensor_layout(1) = [1] + + ! Set up Torch data structures + type(torch_tensor) :: a + + ! Construct a Torch Tensor from a Fortran array + in_data(:) = [2.0, 3.0] + call torch_tensor_from_array(a, in_data, tensor_layout, torch_kCPU) + + ! Extract a Fortran array from a Torch tensor + allocate(out_data(2)) + call torch_tensor_to_array(a, out_data) + write (*,*) "a = ", out_data(:) + + ! 
Cleanup + nullify(out_data) + call torch_tensor_delete(a) + +end program example diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py new file mode 100755 index 00000000..1c57c19b --- /dev/null +++ b/examples/5_Autograd/autograd.py @@ -0,0 +1,16 @@ +#!/usr/bin/env python3 +"""Autograd demo taken from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html.""" + +import torch + +a = torch.tensor([2.0, 3.0], requires_grad=True) +b = torch.tensor([6.0, 4.0], requires_grad=True) + +Q = 3 * a**3 - b**2 +print(Q) + +external_grad = torch.tensor([1.0, 1.0]) +Q.backward(gradient=external_grad) + +assert torch.allclose(9 * a**2, a.grad) +assert torch.allclose(-2 * b, b.grad) diff --git a/examples/5_Autograd/requirements.txt b/examples/5_Autograd/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b81b811f..a993299d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -3,4 +3,5 @@ if(CMAKE_BUILD_TESTS) add_subdirectory(2_ResNet18) # add_subdirectory(3_MultiGPU) add_subdirectory(4_MultiIO) + add_subdirectory(5_Autograd) endif() diff --git a/run_integration_tests.sh b/run_integration_tests.sh index 281c2f54..a9948ff2 100755 --- a/run_integration_tests.sh +++ b/run_integration_tests.sh @@ -12,7 +12,7 @@ set -eu CTEST_ARGS=$@ -EXAMPLES="1_SimpleNet 2_ResNet18 4_MultiIO" +EXAMPLES="1_SimpleNet 2_ResNet18 4_MultiIO 5_Autograd" BUILD_DIR=src/build for EXAMPLE in ${EXAMPLES} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9dc63847..fdc9e6d3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,5 +100,8 @@ if(CMAKE_BUILD_TESTS) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/4_MultiIO DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples ) + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/5_Autograd + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples + ) add_subdirectory(test/examples) endif() From b8b4840e3c8df94c654511da8d00ed05da6c8153 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 12:04:04 +0100 Subject: [PATCH 22/43] Docs for example 5 --- examples/5_Autograd/README.md | 4 ++-- pages/examples.md | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/5_Autograd/README.md b/examples/5_Autograd/README.md index 884db584..3b3bd3e6 100644 --- a/examples/5_Autograd/README.md +++ b/examples/5_Autograd/README.md @@ -1,7 +1,7 @@ # Example 5 - Autograd -This example will demonstrate automatic differentation in FTorch by leveraging -PyTorch's Autograd module. +**This example is currently under development.** Eventually, it will demonstrate +automatic differentation in FTorch by leveraging PyTorch's Autograd module. By exposing Autograd in Fortran, FTorch will be able to compute derivatives of expressions involving `torch_tensor`s. diff --git a/pages/examples.md b/pages/examples.md index 5b3833b8..30b274b2 100644 --- a/pages/examples.md +++ b/pages/examples.md @@ -186,3 +186,10 @@ data to multiple GPU devices. [This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/4_MultiIO) considers a variant of the SimpleNet demo, which demonstrates how to account for multiple input tensors and multiple output tensors. + +#### 5) Autograd + +[This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/5_Autograd) +is currently under development. Eventually, it will demonstrate how to perform +automatic differentiation in FTorch by leveraging PyTorch's Autograd module. 
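For reference, the computation targeted by the example's Python script (`autograd.py`) is
`Q = 3*a**3 - b**2` with `a = [2.0, 3.0]` and `b = [6.0, 4.0]`, giving `Q = [-12.0, 65.0]`
and gradients `dQ/da = 9*a**2 = [36.0, 81.0]` and `dQ/db = -2*b = [-12.0, -8.0]`. The
Python script asserts the gradients via `torch.allclose`, while the integration test
checks the printed values of `Q`.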
+Currently, it just demonstrates how to use `torch_tensor_to_array`. From 806730c55e03b06ff48aa96ec13e4467d9b2a99e Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 18:29:56 +0100 Subject: [PATCH 23/43] More detail on uint8 and float16 not being supported --- src/ctorch.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 252c3f19..f7e16fcf 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -7,7 +7,8 @@ constexpr auto get_dtype(torch_data_t dtype) { switch (dtype) { case torch_kUInt8: - std::cerr << "[WARNING]: uint8 not supported" << std::endl; + std::cerr << "[WARNING]: uint8 not supported in Fortran" << std::endl; + // See https://gcc.gnu.org/onlinedocs/gfortran/ISO_005fFORTRAN_005fENV.html exit(EXIT_FAILURE); case torch_kInt8: return torch::kInt8; @@ -18,7 +19,8 @@ constexpr auto get_dtype(torch_data_t dtype) case torch_kInt64: return torch::kInt64; case torch_kFloat16: - std::cerr << "[WARNING]: float16 not supported" << std::endl; + std::cerr << "[WARNING]: float16 not supported in Fortran" << std::endl; + // See https://gcc.gnu.org/onlinedocs/gfortran/ISO_005fFORTRAN_005fENV.html exit(EXIT_FAILURE); case torch_kFloat32: return torch::kFloat32; From 0f6ee05722b051f0a989dd0f535e4584efd7ff60 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 23 Jul 2024 18:32:36 +0100 Subject: [PATCH 24/43] Add notes on float types --- src/ctorch.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index f7e16fcf..5794ca00 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -199,12 +199,15 @@ void* torch_to_blob(const torch_tensor_t tensor, const torch_data_t dtype) break; case torch_kFloat16: std::cerr << "[WARNING]: float16 not supported" << std::endl; + // NOTE: std::float16_t is available but only with C++23 exit(EXIT_FAILURE); case torch_kFloat32: raw_ptr = (void*) t->data_ptr(); + // NOTE: std::float32_t is available but only with C++23 break; case torch_kFloat64: raw_ptr = (void*) t->data_ptr(); + // NOTE: std::float64_t is available but only with C++23 break; default: std::cerr << "[WARNING]: unknown data type" << std::endl; From e2f6a9d473af461540de51ec64544d53728a9e92 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Wed, 24 Jul 2024 10:15:34 +0100 Subject: [PATCH 25/43] Handle allocation of pointer array --- examples/5_Autograd/autograd.f90 | 3 +- src/ftorch.f90 | 624 +++++++++++++++++++++++++------ src/ftorch.fypp | 26 +- 3 files changed, 526 insertions(+), 127 deletions(-) diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index 16f77162..8e159741 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -24,8 +24,7 @@ program example call torch_tensor_from_array(a, in_data, tensor_layout, torch_kCPU) ! Extract a Fortran array from a Torch tensor - allocate(out_data(2)) - call torch_tensor_to_array(a, out_data) + call torch_tensor_to_array(a, out_data, shape(in_data)) write (*,*) "a = ", out_data(:) ! 
Cleanup diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 18620076..84b17031 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -1768,411 +1768,795 @@ end subroutine torch_tensor_from_array_real64_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `int8` - subroutine torch_tensor_to_array_int8_1d(tensor, data_out) + subroutine torch_tensor_to_array_int8_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int8), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int8_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `int8` - subroutine torch_tensor_to_array_int8_2d(tensor, data_out) + subroutine torch_tensor_to_array_int8_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int8), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int8_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `int8` - subroutine torch_tensor_to_array_int8_3d(tensor, data_out) + subroutine torch_tensor_to_array_int8_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 type(torch_tensor), intent(in) :: tensor !! 
Returned tensor integer(kind=int8), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int8_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `int8` - subroutine torch_tensor_to_array_int8_4d(tensor, data_out) + subroutine torch_tensor_to_array_int8_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int8), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int8_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `int16` - subroutine torch_tensor_to_array_int16_1d(tensor, data_out) + subroutine torch_tensor_to_array_int16_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int16), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr + ! 
Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int16_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `int16` - subroutine torch_tensor_to_array_int16_2d(tensor, data_out) + subroutine torch_tensor_to_array_int16_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int16), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int16_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `int16` - subroutine torch_tensor_to_array_int16_3d(tensor, data_out) + subroutine torch_tensor_to_array_int16_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int16), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int16_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `int16` - subroutine torch_tensor_to_array_int16_4d(tensor, data_out) + subroutine torch_tensor_to_array_int16_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int16), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int16_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `int32` - subroutine torch_tensor_to_array_int32_1d(tensor, data_out) + subroutine torch_tensor_to_array_int32_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int32), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int32_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `int32` - subroutine torch_tensor_to_array_int32_2d(tensor, data_out) + subroutine torch_tensor_to_array_int32_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int32), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int32_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `int32` - subroutine torch_tensor_to_array_int32_3d(tensor, data_out) + subroutine torch_tensor_to_array_int32_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int32), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int32_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `int32` - subroutine torch_tensor_to_array_int32_4d(tensor, data_out) + subroutine torch_tensor_to_array_int32_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int32), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int32_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `int64` - subroutine torch_tensor_to_array_int64_1d(tensor, data_out) + subroutine torch_tensor_to_array_int64_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int64), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int64_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `int64` - subroutine torch_tensor_to_array_int64_2d(tensor, data_out) + subroutine torch_tensor_to_array_int64_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int64), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int64_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `int64` - subroutine torch_tensor_to_array_int64_3d(tensor, data_out) + subroutine torch_tensor_to_array_int64_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int64), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int64_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `int64` - subroutine torch_tensor_to_array_int64_4d(tensor, data_out) + subroutine torch_tensor_to_array_int64_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 type(torch_tensor), intent(in) :: tensor !! Returned tensor integer(kind=int64), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_int64_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `real32` - subroutine torch_tensor_to_array_real32_1d(tensor, data_out) + subroutine torch_tensor_to_array_real32_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real32), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real32_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `real32` - subroutine torch_tensor_to_array_real32_2d(tensor, data_out) + subroutine torch_tensor_to_array_real32_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real32), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real32_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `real32` - subroutine torch_tensor_to_array_real32_3d(tensor, data_out) + subroutine torch_tensor_to_array_real32_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real32), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real32_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `real32` - subroutine torch_tensor_to_array_real32_4d(tensor, data_out) + subroutine torch_tensor_to_array_real32_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real32), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real32_4d !> Return the array data associated with a Torch tensor of rank 1 and data type `real64` - subroutine torch_tensor_to_array_real64_1d(tensor, data_out) + subroutine torch_tensor_to_array_real64_1d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real64), pointer, intent(out) :: data_out(:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer, optional, intent(in) :: sizes(1) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real64_1d !> Return the array data associated with a Torch tensor of rank 2 and data type `real64` - subroutine torch_tensor_to_array_real64_2d(tensor, data_out) + subroutine torch_tensor_to_array_real64_2d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real64), pointer, intent(out) :: data_out(:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer, optional, intent(in) :: sizes(2) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real64_2d !> Return the array data associated with a Torch tensor of rank 3 and data type `real64` - subroutine torch_tensor_to_array_real64_3d(tensor, data_out) + subroutine torch_tensor_to_array_real64_3d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real64), pointer, intent(out) :: data_out(:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer, optional, intent(in) :: sizes(3) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real64_3d !> Return the array data associated with a Torch tensor of rank 4 and data type `real64` - subroutine torch_tensor_to_array_real64_4d(tensor, data_out) + subroutine torch_tensor_to_array_real64_4d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 type(torch_tensor), intent(in) :: tensor !! Returned tensor real(kind=real64), pointer, intent(out) :: data_out(:,:,:,:) !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer, optional, intent(in) :: sizes(4) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1),sizes(2),sizes(3),sizes(4))) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_real64_4d diff --git a/src/ftorch.fypp b/src/ftorch.fypp index fd03fd35..4efbfc38 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -511,20 +511,36 @@ contains #:for PREC in PRECISIONS #:for RANK in RANKS !> Return the array data associated with a Torch tensor of rank ${RANK}$ and data type `${PREC}$` - subroutine torch_tensor_to_array_${PREC}$_${RANK}$d(tensor, data_out) + subroutine torch_tensor_to_array_${PREC}$_${RANK}$d(tensor, data_out, sizes) use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : ${PREC}$ type(torch_tensor), intent(in) :: tensor !! Returned tensor ${f_type(PREC)}$(kind=${PREC}$), pointer, intent(out) :: data_out${ranksuffix(RANK)}$ !! Pointer to tensor data - integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type + integer, optional, intent(in) :: sizes(${RANK}$) !! Number of entries for each rank ! Local data - integer(c_int64_t) :: c_tensor_shape(${RANK}$) !! Shape of the tensor + integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type type(c_ptr) :: cptr + ! Handle allocation of the pointer array + if (present(sizes)) then + if (all(shape(data_out) == 0)) then + allocate(data_out(sizes(1)#{for i in range(1,RANK)}#,sizes(${i+1}$)#{endfor}#)) + else if (any(shape(data_out) /= sizes)) then + write (*,*) "[ERROR]: Array allocated with wrong shape" + stop + end if + else + if ((.not. associated(data_out)) .or. (all(shape(data_out) == 0))) then + write (*,*) "[ERROR]: Pointer array has not been allocated" + stop + end if + end if + + ! 
Have the data_out array point to the Tensor data cptr = torch_to_blob_c(tensor%p, c_dtype) - c_tensor_shape = shape(data_out) - call c_f_pointer(cptr, data_out, c_tensor_shape) + call c_f_pointer(cptr, data_out, sizes) + end subroutine torch_tensor_to_array_${PREC}$_${RANK}$d #:endfor From cdd4433f15e2ad21d409e29547fb86f100cc54f7 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 29 Jul 2024 12:00:40 +0100 Subject: [PATCH 26/43] Merge fixes --- examples/5_Autograd/CMakeLists.txt | 2 +- examples/5_Autograd/autograd.f90 | 2 +- src/ftorch.f90 | 48 +++++++++++++++--------------- src/ftorch.fypp | 2 +- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/examples/5_Autograd/CMakeLists.txt b/examples/5_Autograd/CMakeLists.txt index 1f16b9ec..bc84c8e3 100644 --- a/examples/5_Autograd/CMakeLists.txt +++ b/examples/5_Autograd/CMakeLists.txt @@ -33,5 +33,5 @@ if(CMAKE_BUILD_TESTS) COMMAND autograd WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) set_tests_properties(fautograd PROPERTIES PASS_REGULAR_EXPRESSION - "-12.00000000 65.00000000") + "-12.0000000 65.0000000") endif() diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 index e45ead46..4b537cf6 100644 --- a/examples/5_Autograd/autograd.f90 +++ b/examples/5_Autograd/autograd.f90 @@ -35,7 +35,7 @@ program example Q = 3 * a ** 3 - b ** 2 ! Extract output Fortran array from Q tensor - call torch_tensor_to_array(Q, out_data, shape(in_data)) + call torch_tensor_to_array(Q, out_data, shape(in_data1)) write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:) ! Check a and b are unchanged by the arithmetic operations diff --git a/src/ftorch.f90 b/src/ftorch.f90 index 2c7ff978..3d0246a3 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -968,7 +968,7 @@ end subroutine torch_model_delete !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8` subroutine torch_tensor_from_array_int8_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1022,7 +1022,7 @@ end subroutine torch_tensor_from_array_int8_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8` subroutine torch_tensor_from_array_int8_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1076,7 +1076,7 @@ end subroutine torch_tensor_from_array_int8_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8` subroutine torch_tensor_from_array_int8_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1130,7 +1130,7 @@ end subroutine torch_tensor_from_array_int8_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8` subroutine torch_tensor_from_array_int8_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int8 @@ -1184,7 +1184,7 @@ end subroutine 
torch_tensor_from_array_int8_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16` subroutine torch_tensor_from_array_int16_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1238,7 +1238,7 @@ end subroutine torch_tensor_from_array_int16_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16` subroutine torch_tensor_from_array_int16_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1292,7 +1292,7 @@ end subroutine torch_tensor_from_array_int16_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16` subroutine torch_tensor_from_array_int16_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1346,7 +1346,7 @@ end subroutine torch_tensor_from_array_int16_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16` subroutine torch_tensor_from_array_int16_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int16 @@ -1400,7 +1400,7 @@ end subroutine torch_tensor_from_array_int16_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32` subroutine torch_tensor_from_array_int32_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1454,7 +1454,7 @@ end subroutine torch_tensor_from_array_int32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32` subroutine torch_tensor_from_array_int32_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1508,7 +1508,7 @@ end subroutine torch_tensor_from_array_int32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32` subroutine torch_tensor_from_array_int32_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int32 @@ -1562,7 +1562,7 @@ end subroutine torch_tensor_from_array_int32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32` subroutine torch_tensor_from_array_int32_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic 
:: iso_fortran_env, only : int32 @@ -1616,7 +1616,7 @@ end subroutine torch_tensor_from_array_int32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64` subroutine torch_tensor_from_array_int64_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1670,7 +1670,7 @@ end subroutine torch_tensor_from_array_int64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64` subroutine torch_tensor_from_array_int64_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1724,7 +1724,7 @@ end subroutine torch_tensor_from_array_int64_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int64` subroutine torch_tensor_from_array_int64_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1778,7 +1778,7 @@ end subroutine torch_tensor_from_array_int64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64` subroutine torch_tensor_from_array_int64_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : int64 @@ -1832,7 +1832,7 @@ end subroutine torch_tensor_from_array_int64_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32` subroutine torch_tensor_from_array_real32_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1886,7 +1886,7 @@ end subroutine torch_tensor_from_array_real32_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32` subroutine torch_tensor_from_array_real32_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1940,7 +1940,7 @@ end subroutine torch_tensor_from_array_real32_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32` subroutine torch_tensor_from_array_real32_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -1994,7 +1994,7 @@ end subroutine torch_tensor_from_array_real32_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32` subroutine torch_tensor_from_array_real32_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, 
intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real32 @@ -2048,7 +2048,7 @@ end subroutine torch_tensor_from_array_real32_4d !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64` subroutine torch_tensor_from_array_real64_1d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 @@ -2102,7 +2102,7 @@ end subroutine torch_tensor_from_array_real64_1d !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64` subroutine torch_tensor_from_array_real64_2d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 @@ -2156,7 +2156,7 @@ end subroutine torch_tensor_from_array_real64_2d !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64` subroutine torch_tensor_from_array_real64_3d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 @@ -2210,7 +2210,7 @@ end subroutine torch_tensor_from_array_real64_3d !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64` subroutine torch_tensor_from_array_real64_4d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : real64 diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 82e2ceed..ef9c796c 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -644,7 +644,7 @@ contains #:for RANK in RANKS !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$` subroutine torch_tensor_from_array_${PREC}$_${RANK}$d(tensor, data_in, layout, & - c_device_type, device_index, requires_grad) + device_type, device_index, requires_grad) use, intrinsic :: iso_c_binding, only : c_bool, c_float, c_int, c_int64_t, c_loc use, intrinsic :: iso_fortran_env, only : ${PREC}$ From 6223f36acd6594176a50f7a1dbd5c3a202d639ae Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Thu, 31 Oct 2024 12:10:14 +0000 Subject: [PATCH 27/43] Update autograd example --- examples/5_Autograd/CMakeLists.txt | 37 -------------- examples/5_Autograd/README.md | 68 ------------------------- examples/5_Autograd/autograd.f90 | 54 -------------------- examples/5_Autograd/autograd.py | 16 ------ examples/5_Autograd/requirements.txt | 0 examples/6_Autograd/autograd.f90 | 74 ++++++++++++++++++++-------- 6 files changed, 54 insertions(+), 195 deletions(-) delete mode 100644 examples/5_Autograd/CMakeLists.txt delete mode 100644 examples/5_Autograd/README.md delete mode 100644 examples/5_Autograd/autograd.f90 delete mode 100755 examples/5_Autograd/autograd.py delete mode 100644 examples/5_Autograd/requirements.txt diff --git a/examples/5_Autograd/CMakeLists.txt b/examples/5_Autograd/CMakeLists.txt deleted file mode 100644 index bc84c8e3..00000000 --- a/examples/5_Autograd/CMakeLists.txt +++ /dev/null @@ -1,37 +0,0 
@@ -cmake_minimum_required(VERSION 3.1 FATAL_ERROR) -#policy CMP0076 - target_sources source files are relative to file where target_sources is run -cmake_policy (SET CMP0076 NEW) - -set(PROJECT_NAME AutogradExample) - -project(${PROJECT_NAME} LANGUAGES Fortran) - -# Build in Debug mode if not specified -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE Debug CACHE STRING "" FORCE) -endif() - -find_package(FTorch) -message(STATUS "Building with Fortran PyTorch coupling") - -# Fortran example -add_executable(autograd autograd.f90) -target_link_libraries(autograd PRIVATE FTorch::ftorch) - -# Integration testing -if(CMAKE_BUILD_TESTS) - include(CTest) - - # 1. Check the Python Autograd script runs successfully - add_test(NAME pyautograd - COMMAND ${PYTHON_EXECUTABLE} ${PROJECT_SOURCE_DIR}/autograd.py) - set_tests_properties(pyautograd PROPERTIES PASS_REGULAR_EXPRESSION - "-12., 65.") - - # 2. Check the Fortran Autograd script runs successfully - add_test(NAME fautograd - COMMAND autograd - WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) - set_tests_properties(fautograd PROPERTIES PASS_REGULAR_EXPRESSION - "-12.0000000 65.0000000") -endif() diff --git a/examples/5_Autograd/README.md b/examples/5_Autograd/README.md deleted file mode 100644 index 3b3bd3e6..00000000 --- a/examples/5_Autograd/README.md +++ /dev/null @@ -1,68 +0,0 @@ -# Example 5 - Autograd - -**This example is currently under development.** Eventually, it will demonstrate -automatic differentation in FTorch by leveraging PyTorch's Autograd module. - -By exposing Autograd in Fortran, FTorch will be able to compute derivatives of -expressions involving `torch_tensor`s. - -## Description - -A Python demo is copied from the PyTorch documentation as `autograd.py`, which -shows how to compute the gradient of an arithmetic combination of Torch Tensors. - -The demo will be replicated in Fortran as `autograd.f90`, to show how to do the -same thing using FTorch. This demo is currently unfinished and simply -demonstrates how to use the `torch_tensor_to_array` subroutine to extract a -Fortran array from a `torch_tensor`. - -## Dependencies - -To run this example requires: - -- CMake -- Fortran compiler -- FTorch (installed as described in main package) -- Python 3 - -## Running - -To run this example install FTorch as described in the main documentation. -Then from this directory create a virtual environment and install the necessary -Python modules: -``` -python3 -m venv venv -source venv/bin/activate -pip install -r requirements.txt -``` - -Run the Python version of the demo with -``` -python3 autograd.py -``` -This performs some arithmetic on two input tensors [2.0, 3.0] and [6.0, 4.0] to -produce the result: -``` -tensor([-12., 65.], grad_fn=) -``` -where `` refers to the method used for computing the gradient. - - -To run the Fortran version of the demo we need to compile with (for example) -``` -mkdir build -cd build -cmake .. -DCMAKE_PREFIX_PATH= -DCMAKE_BUILD_TYPE=Release -cmake --build . -``` - -To run the compiled code, simply use -``` -./autograd -``` -Currently, the example simply constructs a Torch Tensor from the array and then -extracts it again. As such, running the above should print the same values as -the input: -``` - 2.00000000 3.00000000 -``` diff --git a/examples/5_Autograd/autograd.f90 b/examples/5_Autograd/autograd.f90 deleted file mode 100644 index 4b537cf6..00000000 --- a/examples/5_Autograd/autograd.f90 +++ /dev/null @@ -1,54 +0,0 @@ -program example - - ! 
Import precision info from iso - use, intrinsic :: iso_fortran_env, only : sp => real32 - - ! Import our library for interfacing with PyTorch's Autograd module - use ftorch - - implicit none - - ! Set working precision for reals - integer, parameter :: wp = sp - - ! Set up Fortran data structures - real(wp), dimension(2), target :: in_data1 - real(wp), dimension(2), target :: in_data2 - real(wp), dimension(:), pointer :: out_data - integer :: tensor_layout(1) = [1] - - ! Set up Torch data structures - type(torch_tensor) :: a, b, Q - - ! Initialise data - in_data1(:) = [2.0, 3.0] - in_data2(:) = [6.0, 4.0] - - ! FIXME: requires_grad=.true. - call torch_tensor_from_array(a, in_data1, tensor_layout, torch_kCPU) - call torch_tensor_from_array(b, in_data2, tensor_layout, torch_kCPU) - call torch_tensor_from_array(Q, out_data, tensor_layout, torch_kCPU) - - ! Check arithmetic operations work for torch_tensors - write (*,*) "a = ", in_data1(:) - write (*,*) "b = ", in_data2(:) - Q = 3 * a ** 3 - b ** 2 - - ! Extract output Fortran array from Q tensor - call torch_tensor_to_array(Q, out_data, shape(in_data1)) - write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:) - - ! Check a and b are unchanged by the arithmetic operations - write (*,*) "a = ", in_data1(:) - write (*,*) "b = ", in_data2(:) - - ! TODO: Backward - ! Requires API extension - - ! Cleanup - nullify(out_data) - call torch_tensor_delete(a) - call torch_tensor_delete(b) - call torch_tensor_delete(Q) - -end program example diff --git a/examples/5_Autograd/autograd.py b/examples/5_Autograd/autograd.py deleted file mode 100755 index 1c57c19b..00000000 --- a/examples/5_Autograd/autograd.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -"""Autograd demo taken from https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html.""" - -import torch - -a = torch.tensor([2.0, 3.0], requires_grad=True) -b = torch.tensor([6.0, 4.0], requires_grad=True) - -Q = 3 * a**3 - b**2 -print(Q) - -external_grad = torch.tensor([1.0, 1.0]) -Q.backward(gradient=external_grad) - -assert torch.allclose(9 * a**2, a.grad) -assert torch.allclose(-2 * b, b.grad) diff --git a/examples/5_Autograd/requirements.txt b/examples/5_Autograd/requirements.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index b44e9906..c3b0162b 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -15,8 +15,9 @@ program example integer, parameter :: wp = sp ! Set up Fortran data structures - integer, parameter :: n=2, m=5 - real(wp), dimension(n,m), target :: in_data + integer, parameter :: n=2, m=1 + real(wp), dimension(n,m), target :: in_data1 + real(wp), dimension(n,m), target :: in_data2 real(wp), dimension(:,:), pointer :: out_data real(wp), dimension(n,m) :: expected integer :: tensor_layout(2) = [1, 2] @@ -26,45 +27,78 @@ program example logical :: test_pass ! Set up Torch data structures - type(torch_tensor) :: tensor + type(torch_tensor) :: a, b, Q - ! initialize in_data with some fake data - do j = 1, m - do i = 1, n - in_data(i,j) = ((i-1)*m + j) * 1.0_wp - end do - end do + ! Initialise input arrays as in Python example + in_data1(:,1) = [2.0, 3.0] + in_data2(:,1) = [6.0, 4.0] ! Construct a Torch Tensor from a Fortran array - call torch_tensor_from_array(tensor, in_data, tensor_layout, torch_kCPU) + ! TODO: Implement requires_grad=.true. 
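
The expression evaluated a few lines below relies on the operator overloads introduced earlier in this series. As a reading aid (my interpretation of how the generic interfaces resolve, not something stated in the patch itself), the sketch notes which wrapper each operator maps to for the tensors `a`, `b` and `Q` used in this example:

```
! Sketch only -- how `Q = 3 * a ** 3 - b ** 2` appears to resolve through the
! overloaded operators; `a` and `b` are the torch_tensor variables created in
! this example, and the mapping below is an assumption based on the interfaces
! added in this patch series.
!   a ** 3 and b ** 2   ->  torch_tensor_power        (tensor ** scalar)
!   3 * (a ** 3)        ->  torch_tensor_premultiply  (scalar * tensor)
!   ... - ...           ->  torch_tensor_subtract     (tensor - tensor)
!   Q = ...             ->  torch_tensor_assign via assignment(=)
Q = 3 * a ** 3 - b ** 2
```
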
+ call torch_tensor_from_array(a, in_data1, tensor_layout, torch_kCPU) + call torch_tensor_from_array(b, in_data2, tensor_layout, torch_kCPU) ! check tensor rank and shape match those of in_data - if (tensor%get_rank() /= 2) then + if ((a%get_rank() /= 2) .or. (b%get_rank() /= 2)) then print *, "Error :: rank should be 2" stop 1 end if - if (any(tensor%get_shape() /= [2, 5])) then - print *, "Error :: shape should be (2, 5)" + if (any(a%get_shape() /= [n, m]) .or. any(b%get_shape() /= [n, m])) then + write(6,"('Error :: shape should be (',i1,', ',i1,')')") n, m stop 1 end if + ! Check arithmetic operations work for torch_tensors + write (*,*) "a = ", in_data1(:,1) + write (*,*) "b = ", in_data2(:,1) + Q = 3 * a ** 3 - b ** 2 + ! Extract a Fortran array from a Torch tensor - call torch_tensor_to_array(tensor, out_data, shape(in_data)) + call torch_tensor_to_array(Q, out_data, shape(in_data1)) + write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:,1) ! Check output tensor matches expected value - expected(:,:) = in_data + expected(:,1) = [-12.0, 65.0] test_pass = assert_real_array_2d(out_data, expected, test_name="torch_tensor_to_array", rtol=1e-5) + if (.not. test_pass) then + call clean_up() + print *, "Error :: out_data does not match expected value" + stop 999 + end if - ! Check that the data match + ! Check first input array is unchanged by the arithmetic operations + expected(:,1) = [2.0, 3.0] + test_pass = assert_real_array_2d(in_data1, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then - print *, "Error :: in_data does not match out_data" + call clean_up() + print *, "Error :: in_data1 was changed during arithmetic operations" stop 999 end if - ! Cleanup - nullify(out_data) - call torch_tensor_delete(tensor) + ! Check second input array is unchanged by the arithmetic operations + expected(:,1) = [6.0, 4.0] + test_pass = assert_real_array_2d(in_data2, expected, test_name="torch_tensor_to_array", rtol=1e-5) + if (.not. test_pass) then + call clean_up() + print *, "Error :: in_data2 was changed during arithmetic operations" + stop 999 + end if + + ! Back-propagation + ! TODO: Requires API extension + ! Cleanup + call clean_up() write (*,*) "Autograd example ran successfully" + contains + + ! Subroutine for freeing memory and nullifying pointers used in the example + subroutine clean_up() + nullify(out_data) + call torch_tensor_delete(a) + call torch_tensor_delete(b) + call torch_tensor_delete(Q) + end subroutine clean_up + end program example From 536480060c501ddea742bc5a0a9b8aee70d006d8 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 11 Nov 2024 09:40:30 +0000 Subject: [PATCH 28/43] Use assert_allclose in new code --- examples/6_Autograd/autograd.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index aaf222f4..c04cb86f 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -68,7 +68,7 @@ program example ! Check first input array is unchanged by the arithmetic operations expected(:,1) = [2.0, 3.0] - test_pass = assert_real_array_2d(in_data1, expected, test_name="torch_tensor_to_array", rtol=1e-5) + test_pass = assert_allclose(in_data1, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then call clean_up() print *, "Error :: in_data1 was changed during arithmetic operations" @@ -77,7 +77,7 @@ program example ! 
Check second input array is unchanged by the arithmetic operations expected(:,1) = [6.0, 4.0] - test_pass = assert_real_array_2d(in_data2, expected, test_name="torch_tensor_to_array", rtol=1e-5) + test_pass = assert_allclose(in_data2, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then call clean_up() print *, "Error :: in_data2 was changed during arithmetic operations" From 59ffdc44d7728f41e7fe3539bc2c32c784d555e4 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 6 Dec 2024 11:25:17 +0000 Subject: [PATCH 29/43] Use bare import for autograd --- examples/6_Autograd/autograd.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index 67c11b9e..3b0d659b 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -4,8 +4,8 @@ program example use, intrinsic :: iso_fortran_env, only : sp => real32 ! Import our library for interfacing with PyTorch's Autograd module - use ftorch, only : torch_tensor, torch_kCPU, & - torch_tensor_from_array, torch_tensor_to_array, torch_tensor_delete + ! NOTE: Need bare import to get the operator overloading + use ftorch ! Import our tools module for testing utils use ftorch_test_utils, only : assert_allclose From f504f5ecbc479a2fb7ccc81917f2d5e7e4188760 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 6 Dec 2024 11:30:30 +0000 Subject: [PATCH 30/43] Lint --- src/ctorch.cpp | 3 --- src/ftorch.f90 | 23 +++++++++++++++++++++++ src/ftorch.fypp | 8 ++++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 50849b8d..771a4835 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -279,7 +279,6 @@ torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, const torch_tensor_t tensor) - { auto t = reinterpret_cast(tensor); torch::Tensor* output = nullptr; @@ -290,8 +289,6 @@ torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, torch_tensor_t torch_tensor_postmultiply(const torch_tensor_t tensor, const torch_data_t scalar) - - { auto t = reinterpret_cast(tensor); torch::Tensor* output = nullptr; diff --git a/src/ftorch.f90 b/src/ftorch.f90 index cde1965f..5b8f92b0 100644 --- a/src/ftorch.f90 +++ b/src/ftorch.f90 @@ -463,6 +463,7 @@ subroutine torch_tensor_assign(output, input) function torch_tensor_assign_c(input_c) result(output_c) & bind(c, name = 'torch_tensor_assign') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: input_c type(c_ptr) :: output_c end function torch_tensor_assign_c @@ -481,6 +482,7 @@ function torch_tensor_add(tensor1, tensor2) result(output) function torch_tensor_add_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_add') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -500,6 +502,7 @@ function torch_tensor_subtract(tensor1, tensor2) result(output) function torch_tensor_subtract_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_subtract') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -519,6 +522,7 @@ function torch_tensor_multiply(tensor1, tensor2) result(output) function torch_tensor_multiply_c(tensor1_c, tensor2_c) 
result(output_c) & bind(c, name = 'torch_tensor_multiply') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -539,6 +543,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int8 + implicit none integer(kind=int8), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -559,6 +564,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int16 + implicit none integer(kind=int16), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -579,6 +585,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int32 + implicit none integer(kind=int32), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -599,6 +606,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int64 + implicit none integer(kind=int64), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -619,6 +627,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real32 + implicit none real(kind=real32), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -639,6 +648,7 @@ function torch_tensor_premultiply_c(scalar_c, tensor_c) result(output_c) & bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real64 + implicit none real(kind=real64), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -660,6 +670,7 @@ function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int8 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int8), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -680,6 +691,7 @@ function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int16 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int16), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -700,6 +712,7 @@ function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int32 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int32), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -720,6 +733,7 @@ 
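
Because a `premultiply` and a `postmultiply` binding are declared for every supported kind, the generic `operator(*)` appears to accept a scalar on either side of a tensor. A minimal sketch, assuming FTorch is built from this patch series and that both procedures are attached to `operator(*)`:

```
program scalar_multiply_sketch
  ! Illustrative only: scalar * tensor and tensor * scalar via operator(*).
  use ftorch
  use, intrinsic :: iso_fortran_env, only : real32
  implicit none

  real(real32), target :: x(2) = [1.0_real32, 2.0_real32]
  integer :: layout(1) = [1]
  type(torch_tensor) :: t, u, v

  call torch_tensor_from_array(t, x, layout, torch_kCPU)
  u = 2.0_real32 * t   ! resolves to torch_tensor_premultiply(scalar, tensor)
  v = t * 2.0_real32   ! resolves to torch_tensor_postmultiply(tensor, scalar)

  call torch_tensor_delete(t)
  call torch_tensor_delete(u)
  call torch_tensor_delete(v)
end program scalar_multiply_sketch
```
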
function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int64 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int64), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -740,6 +754,7 @@ function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real32 + implicit none type(c_ptr), value, intent(in) :: tensor_c real(kind=real32), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -760,6 +775,7 @@ function torch_tensor_postmultiply_c(tensor_c, scalar_c) & result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real64 + implicit none type(c_ptr), value, intent(in) :: tensor_c real(kind=real64), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -779,6 +795,7 @@ function torch_tensor_divide(tensor1, tensor2) result(output) function torch_tensor_divide_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_divide') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -799,6 +816,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int8 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int8), value, intent(in) :: power_c type(c_ptr) :: output_c @@ -819,6 +837,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int16 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int16), value, intent(in) :: power_c type(c_ptr) :: output_c @@ -839,6 +858,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int32 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int32), value, intent(in) :: power_c type(c_ptr) :: output_c @@ -859,6 +879,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : int64 + implicit none type(c_ptr), value, intent(in) :: tensor_c integer(kind=int64), value, intent(in) :: power_c type(c_ptr) :: output_c @@ -879,6 +900,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real32 + implicit none type(c_ptr), value, intent(in) :: tensor_c real(kind=real32), value, intent(in) :: power_c type(c_ptr) :: output_c @@ -899,6 +921,7 @@ function torch_tensor_power_c(tensor_c, power_c) result(output_c) & bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : real64 + implicit none type(c_ptr), value, intent(in) :: tensor_c real(kind=real64), value, intent(in) :: power_c type(c_ptr) :: 
output_c diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 9d0b8214..60b4f880 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -419,6 +419,7 @@ contains function torch_tensor_assign_c(input_c) result(output_c) & bind(c, name = 'torch_tensor_assign') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: input_c type(c_ptr) :: output_c end function torch_tensor_assign_c @@ -437,6 +438,7 @@ contains function torch_tensor_add_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_add') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -456,6 +458,7 @@ contains function torch_tensor_subtract_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_subtract') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -475,6 +478,7 @@ contains function torch_tensor_multiply_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_multiply') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -496,6 +500,7 @@ contains bind(c, name = 'torch_tensor_premultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : ${PREC}$ + implicit none ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: scalar_c type(c_ptr), value, intent(in) :: tensor_c type(c_ptr) :: output_c @@ -519,6 +524,7 @@ contains result(output_c) bind(c, name = 'torch_tensor_postmultiply') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : ${PREC}$ + implicit none type(c_ptr), value, intent(in) :: tensor_c ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: scalar_c type(c_ptr) :: output_c @@ -539,6 +545,7 @@ contains function torch_tensor_divide_c(tensor1_c, tensor2_c) result(output_c) & bind(c, name = 'torch_tensor_divide') use, intrinsic :: iso_c_binding, only : c_ptr + implicit none type(c_ptr), value, intent(in) :: tensor1_c type(c_ptr), value, intent(in) :: tensor2_c type(c_ptr) :: output_c @@ -560,6 +567,7 @@ contains bind(c, name = 'torch_tensor_power') use, intrinsic :: iso_c_binding, only : c_ptr use, intrinsic :: iso_fortran_env, only : ${PREC}$ + implicit none type(c_ptr), value, intent(in) :: tensor_c ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: power_c type(c_ptr) :: output_c From 8379001e0bce32457e98dc90322c55ed38bc5f05 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 9 Dec 2024 11:44:12 +0000 Subject: [PATCH 31/43] Lint --- examples/6_Autograd/autograd.f90 | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index 3b0d659b..0abb1b4c 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -4,8 +4,9 @@ program example use, intrinsic :: iso_fortran_env, only : sp => real32 ! Import our library for interfacing with PyTorch's Autograd module - ! NOTE: Need bare import to get the operator overloading - use ftorch + use ftorch, only: assignment(=), operator(+), operator(-), operator(*), & + operator(**), torch_kCPU, torch_tensor, torch_tensor_delete, & + torch_tensor_from_array, torch_tensor_to_array ! 
Import our tools module for testing utils use ftorch_test_utils, only : assert_allclose @@ -31,8 +32,8 @@ program example type(torch_tensor) :: a, b, Q ! Initialise input arrays as in Python example - in_data1(:,1) = [2.0, 3.0] - in_data2(:,1) = [6.0, 4.0] + in_data1(:,1) = [2.0_wp, 3.0_wp] + in_data2(:,1) = [6.0_wp, 4.0_wp] ! Construct a Torch Tensor from a Fortran array ! TODO: Implement requires_grad=.true. @@ -59,7 +60,7 @@ program example write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:,1) ! Check output tensor matches expected value - expected(:,1) = [-12.0, 65.0] + expected(:,1) = [-12.0_wp, 65.0_wp] test_pass = assert_allclose(out_data, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then call clean_up() @@ -68,7 +69,7 @@ program example end if ! Check first input array is unchanged by the arithmetic operations - expected(:,1) = [2.0, 3.0] + expected(:,1) = [2.0_wp, 3.0_wp] test_pass = assert_allclose(in_data1, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then call clean_up() @@ -77,7 +78,7 @@ program example end if ! Check second input array is unchanged by the arithmetic operations - expected(:,1) = [6.0, 4.0] + expected(:,1) = [6.0_wp, 4.0_wp] test_pass = assert_allclose(in_data2, expected, test_name="torch_tensor_to_array", rtol=1e-5) if (.not. test_pass) then call clean_up() From a5909556df723ad6a769e829b17e5530e6c5ea40 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 9 Dec 2024 12:24:29 +0000 Subject: [PATCH 32/43] Apply clang-format --- src/ctorch.cpp | 114 +++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 61 deletions(-) diff --git a/src/ctorch.cpp b/src/ctorch.cpp index 771a4835..71d3dcd3 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -234,88 +234,80 @@ void torch_tensor_delete(torch_tensor_t tensor) { delete t; } -torch_tensor_t torch_tensor_assign(const torch_tensor_t input) -{ - auto in = reinterpret_cast(input); - torch::AutoGradMode enable_grad(in->requires_grad()); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = in->detach().clone(); - return output; +torch_tensor_t torch_tensor_assign(const torch_tensor_t input) { + auto in = reinterpret_cast(input); + torch::AutoGradMode enable_grad(in->requires_grad()); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = in->detach().clone(); + return output; } torch_tensor_t torch_tensor_add(const torch_tensor_t tensor1, - const torch_tensor_t tensor2) -{ - auto t1 = reinterpret_cast(tensor1); - auto t2 = reinterpret_cast(tensor2); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = *t1 + *t2; - return output; + const torch_tensor_t tensor2) { + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t1 + *t2; + return output; } torch_tensor_t torch_tensor_subtract(const torch_tensor_t tensor1, - const torch_tensor_t tensor2) -{ - auto t1 = reinterpret_cast(tensor1); - auto t2 = reinterpret_cast(tensor2); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = *t1 - *t2; - return output; + const torch_tensor_t tensor2) { + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t1 - *t2; + return output; } torch_tensor_t torch_tensor_multiply(const torch_tensor_t tensor1, - const torch_tensor_t tensor2) -{ - auto t1 = 
reinterpret_cast(tensor1); - auto t2 = reinterpret_cast(tensor2); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = *t1 * *t2; - return output; + const torch_tensor_t tensor2) { + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t1 * *t2; + return output; } torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, - const torch_tensor_t tensor) -{ - auto t = reinterpret_cast(tensor); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = scalar * *t; - return output; + const torch_tensor_t tensor) { + auto t = reinterpret_cast(tensor); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = scalar * *t; + return output; } torch_tensor_t torch_tensor_postmultiply(const torch_tensor_t tensor, - const torch_data_t scalar) -{ - auto t = reinterpret_cast(tensor); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = *t * scalar; - return output; + const torch_data_t scalar) { + auto t = reinterpret_cast(tensor); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t * scalar; + return output; } torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, - const torch_tensor_t tensor2) -{ - auto t1 = reinterpret_cast(tensor1); - auto t2 = reinterpret_cast(tensor2); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = *t1 / *t2; - return output; + const torch_tensor_t tensor2) { + auto t1 = reinterpret_cast(tensor1); + auto t2 = reinterpret_cast(tensor2); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t1 / *t2; + return output; } torch_tensor_t torch_tensor_power(const torch_tensor_t tensor, - const torch_data_t exponent) -{ - auto t = reinterpret_cast(tensor); - torch::Tensor* output = nullptr; - output = new torch::Tensor; - *output = pow(*t, exponent); - return output; + const torch_data_t exponent) { + auto t = reinterpret_cast(tensor); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = pow(*t, exponent); + return output; } torch_jit_script_module_t From c12d5c35d7a3a0ab4449331bfc9ed2e669d441a0 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Mon, 9 Dec 2024 12:27:55 +0000 Subject: [PATCH 33/43] Apply clang-format to header --- src/ctorch.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ctorch.h b/src/ctorch.h index ee796ef5..8e8eb971 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -206,7 +206,6 @@ EXPORT_C torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, EXPORT_C torch_tensor_t torch_tensor_power(const torch_tensor_t tensor, const torch_data_t exponent); - // ===================================================================================== // Module API // ===================================================================================== From d9000e1610406957ec8a81e1d02c0f001a361b7f Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Tue, 10 Dec 2024 15:41:51 +0000 Subject: [PATCH 34/43] Post-merge fixes --- src/CMakeLists.txt | 2 +- src/ftorch.F90 | 360 ++++++++++++++++++++++----------------------- src/ftorch.fypp | 12 +- 3 files changed, 187 insertions(+), 187 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 848e41e4..2b26818d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -115,11 +115,11 @@ if(CMAKE_BUILD_TESTS) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/1_SimpleNet DESTINATION 
${CMAKE_CURRENT_SOURCE_DIR}/test/examples) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/2_ResNet18 + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) if(ENABLE_CUDA) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/3_MultiGPU DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) endif() - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/4_MultiIO DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) # file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/5_Looping diff --git a/src/ftorch.F90 b/src/ftorch.F90 index 49be22ce..867e2e00 100644 --- a/src/ftorch.F90 +++ b/src/ftorch.F90 @@ -1089,10 +1089,10 @@ subroutine torch_tensor_from_array_int8_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: strides(1) !! Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1143,10 +1143,10 @@ subroutine torch_tensor_from_array_int8_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1197,10 +1197,10 @@ subroutine torch_tensor_from_array_int8_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1251,10 +1251,10 @@ subroutine torch_tensor_from_array_int8_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! 
Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1305,10 +1305,10 @@ subroutine torch_tensor_from_array_int8_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1359,10 +1359,10 @@ subroutine torch_tensor_from_array_int16_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: strides(1) !! Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1413,10 +1413,10 @@ subroutine torch_tensor_from_array_int16_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! 
Whether gradients need to be computed for the created tensor @@ -1467,10 +1467,10 @@ subroutine torch_tensor_from_array_int16_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1521,10 +1521,10 @@ subroutine torch_tensor_from_array_int16_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1575,10 +1575,10 @@ subroutine torch_tensor_from_array_int16_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1629,10 +1629,10 @@ subroutine torch_tensor_from_array_int32_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: strides(1) !! 
Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1683,10 +1683,10 @@ subroutine torch_tensor_from_array_int32_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1737,10 +1737,10 @@ subroutine torch_tensor_from_array_int32_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1791,10 +1791,10 @@ subroutine torch_tensor_from_array_int32_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1845,10 +1845,10 @@ subroutine torch_tensor_from_array_int32_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! 
Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1899,10 +1899,10 @@ subroutine torch_tensor_from_array_int64_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: strides(1) !! Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -1953,10 +1953,10 @@ subroutine torch_tensor_from_array_int64_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2007,10 +2007,10 @@ subroutine torch_tensor_from_array_int64_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2061,10 +2061,10 @@ subroutine torch_tensor_from_array_int64_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! 
Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2115,10 +2115,10 @@ subroutine torch_tensor_from_array_int64_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2169,10 +2169,10 @@ subroutine torch_tensor_from_array_real32_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: strides(1) !! Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2223,10 +2223,10 @@ subroutine torch_tensor_from_array_real32_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! 
Whether gradients need to be computed for the created tensor @@ -2277,10 +2277,10 @@ subroutine torch_tensor_from_array_real32_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2331,10 +2331,10 @@ subroutine torch_tensor_from_array_real32_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2385,10 +2385,10 @@ subroutine torch_tensor_from_array_real32_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2439,10 +2439,10 @@ subroutine torch_tensor_from_array_real64_1d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(1) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type - integer(c_int64_t) :: strides(1) !! Strides for accessing data - integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(1) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: strides(1) !! 
Strides for accessing data + integer(c_int), parameter :: ndims = 1 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2493,10 +2493,10 @@ subroutine torch_tensor_from_array_real64_2d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(2) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type - integer(c_int64_t) :: strides(2) !! Strides for accessing data - integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(2) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: strides(2) !! Strides for accessing data + integer(c_int), parameter :: ndims = 2 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2547,10 +2547,10 @@ subroutine torch_tensor_from_array_real64_3d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(3) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type - integer(c_int64_t) :: strides(3) !! Strides for accessing data - integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(3) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: strides(3) !! Strides for accessing data + integer(c_int), parameter :: ndims = 3 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2601,10 +2601,10 @@ subroutine torch_tensor_from_array_real64_4d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(4) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type - integer(c_int64_t) :: strides(4) !! Strides for accessing data - integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(4) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: strides(4) !! Strides for accessing data + integer(c_int), parameter :: ndims = 4 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2655,10 +2655,10 @@ subroutine torch_tensor_from_array_real64_5d(tensor, data_in, layout, & logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(5) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type - integer(c_int64_t) :: strides(5) !! Strides for accessing data - integer(c_int), parameter :: ndims = 5 !! 
Number of dimension of input data + integer(c_int64_t) :: tensor_shape(5) !! Shape of the tensor + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type + integer(c_int64_t) :: strides(5) !! Strides for accessing data + integer(c_int), parameter :: ndims = 5 !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -2703,7 +2703,7 @@ subroutine torch_tensor_to_array_int8_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2718,7 +2718,7 @@ subroutine torch_tensor_to_array_int8_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int8_1d @@ -2733,7 +2733,7 @@ subroutine torch_tensor_to_array_int8_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2748,7 +2748,7 @@ subroutine torch_tensor_to_array_int8_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int8_2d @@ -2763,7 +2763,7 @@ subroutine torch_tensor_to_array_int8_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2778,7 +2778,7 @@ subroutine torch_tensor_to_array_int8_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int8_3d @@ -2793,7 +2793,7 @@ subroutine torch_tensor_to_array_int8_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int), parameter :: dtype = torch_kInt8 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2808,7 +2808,7 @@ subroutine torch_tensor_to_array_int8_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int8_4d @@ -2823,7 +2823,7 @@ subroutine torch_tensor_to_array_int8_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt8 !! Data type + integer(c_int), parameter :: dtype = torch_kInt8 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2838,7 +2838,7 @@ subroutine torch_tensor_to_array_int8_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int8_5d @@ -2853,7 +2853,7 @@ subroutine torch_tensor_to_array_int16_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2868,7 +2868,7 @@ subroutine torch_tensor_to_array_int16_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int16_1d @@ -2883,7 +2883,7 @@ subroutine torch_tensor_to_array_int16_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2898,7 +2898,7 @@ subroutine torch_tensor_to_array_int16_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int16_2d @@ -2913,7 +2913,7 @@ subroutine torch_tensor_to_array_int16_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2928,7 +2928,7 @@ subroutine torch_tensor_to_array_int16_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int16_3d @@ -2943,7 +2943,7 @@ subroutine torch_tensor_to_array_int16_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int), parameter :: dtype = torch_kInt16 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2958,7 +2958,7 @@ subroutine torch_tensor_to_array_int16_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int16_4d @@ -2973,7 +2973,7 @@ subroutine torch_tensor_to_array_int16_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt16 !! Data type + integer(c_int), parameter :: dtype = torch_kInt16 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -2988,7 +2988,7 @@ subroutine torch_tensor_to_array_int16_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int16_5d @@ -3003,7 +3003,7 @@ subroutine torch_tensor_to_array_int32_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3018,7 +3018,7 @@ subroutine torch_tensor_to_array_int32_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int32_1d @@ -3033,7 +3033,7 @@ subroutine torch_tensor_to_array_int32_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3048,7 +3048,7 @@ subroutine torch_tensor_to_array_int32_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int32_2d @@ -3063,7 +3063,7 @@ subroutine torch_tensor_to_array_int32_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3078,7 +3078,7 @@ subroutine torch_tensor_to_array_int32_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int32_3d @@ -3093,7 +3093,7 @@ subroutine torch_tensor_to_array_int32_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int), parameter :: dtype = torch_kInt32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3108,7 +3108,7 @@ subroutine torch_tensor_to_array_int32_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int32_4d @@ -3123,7 +3123,7 @@ subroutine torch_tensor_to_array_int32_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt32 !! Data type + integer(c_int), parameter :: dtype = torch_kInt32 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3138,7 +3138,7 @@ subroutine torch_tensor_to_array_int32_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int32_5d @@ -3153,7 +3153,7 @@ subroutine torch_tensor_to_array_int64_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3168,7 +3168,7 @@ subroutine torch_tensor_to_array_int64_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int64_1d @@ -3183,7 +3183,7 @@ subroutine torch_tensor_to_array_int64_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3198,7 +3198,7 @@ subroutine torch_tensor_to_array_int64_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int64_2d @@ -3213,7 +3213,7 @@ subroutine torch_tensor_to_array_int64_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3228,7 +3228,7 @@ subroutine torch_tensor_to_array_int64_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int64_3d @@ -3243,7 +3243,7 @@ subroutine torch_tensor_to_array_int64_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int), parameter :: dtype = torch_kInt64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3258,7 +3258,7 @@ subroutine torch_tensor_to_array_int64_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int64_4d @@ -3273,7 +3273,7 @@ subroutine torch_tensor_to_array_int64_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kInt64 !! Data type + integer(c_int), parameter :: dtype = torch_kInt64 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3288,7 +3288,7 @@ subroutine torch_tensor_to_array_int64_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_int64_5d @@ -3303,7 +3303,7 @@ subroutine torch_tensor_to_array_real32_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3318,7 +3318,7 @@ subroutine torch_tensor_to_array_real32_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real32_1d @@ -3333,7 +3333,7 @@ subroutine torch_tensor_to_array_real32_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3348,7 +3348,7 @@ subroutine torch_tensor_to_array_real32_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real32_2d @@ -3363,7 +3363,7 @@ subroutine torch_tensor_to_array_real32_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3378,7 +3378,7 @@ subroutine torch_tensor_to_array_real32_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real32_3d @@ -3393,7 +3393,7 @@ subroutine torch_tensor_to_array_real32_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat32 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3408,7 +3408,7 @@ subroutine torch_tensor_to_array_real32_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real32_4d @@ -3423,7 +3423,7 @@ subroutine torch_tensor_to_array_real32_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat32 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat32 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3438,7 +3438,7 @@ subroutine torch_tensor_to_array_real32_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real32_5d @@ -3453,7 +3453,7 @@ subroutine torch_tensor_to_array_real64_1d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3468,7 +3468,7 @@ subroutine torch_tensor_to_array_real64_1d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real64_1d @@ -3483,7 +3483,7 @@ subroutine torch_tensor_to_array_real64_2d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3498,7 +3498,7 @@ subroutine torch_tensor_to_array_real64_2d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real64_2d @@ -3513,7 +3513,7 @@ subroutine torch_tensor_to_array_real64_3d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3528,7 +3528,7 @@ subroutine torch_tensor_to_array_real64_3d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real64_3d @@ -3543,7 +3543,7 @@ subroutine torch_tensor_to_array_real64_4d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat64 !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3558,7 +3558,7 @@ subroutine torch_tensor_to_array_real64_4d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real64_4d @@ -3573,7 +3573,7 @@ subroutine torch_tensor_to_array_real64_5d(tensor, data_out, sizes) integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = torch_kFloat64 !! Data type + integer(c_int), parameter :: dtype = torch_kFloat64 !! 
Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -3588,7 +3588,7 @@ subroutine torch_tensor_to_array_real64_5d(tensor, data_out, sizes) end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_real64_5d diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 0d995324..5aa0148b 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -738,10 +738,10 @@ contains logical, optional, intent(in) :: requires_grad !! Whether gradients need to be computed for the created tensor ! local data - integer(c_int64_t) :: c_tensor_shape(${RANK}$) !! Shape of the tensor - integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type - integer(c_int64_t) :: strides(${RANK}$) !! Strides for accessing data - integer(c_int), parameter :: ndims = ${RANK}$ !! Number of dimension of input data + integer(c_int64_t) :: tensor_shape(${RANK}$) !! Shape of the tensor + integer(c_int), parameter :: dtype = ${enum_from_prec(PREC)}$ !! Data type + integer(c_int64_t) :: strides(${RANK}$) !! Strides for accessing data + integer(c_int), parameter :: ndims = ${RANK}$ !! Number of dimension of input data integer(ftorch_int) :: i integer(c_int) :: device_index_value logical :: requires_grad_value !! Whether gradients need to be computed for the created tensor @@ -790,7 +790,7 @@ contains integer(kind=int64), allocatable :: my_shape(:) !! Number of entries for each rank ! Local data - integer(c_int), parameter :: c_dtype = ${enum_from_prec(PREC)}$ !! Data type + integer(c_int), parameter :: dtype = ${enum_from_prec(PREC)}$ !! Data type type(c_ptr) :: cptr my_shape = tensor%get_shape() @@ -805,7 +805,7 @@ contains end if ! Have the data_out array point to the Tensor data - cptr = torch_to_blob_c(tensor%p, c_dtype) + cptr = torch_to_blob_c(tensor%p, dtype) call c_f_pointer(cptr, data_out, my_shape) end subroutine torch_tensor_to_array_${PREC}$_${RANK}$d From c87fd9760b829f29bdad3952a49d145cd9a320ec Mon Sep 17 00:00:00 2001 From: tommelt Date: Mon, 16 Dec 2024 12:25:53 +0000 Subject: [PATCH 35/43] test: make windows CI more robust Previously I hard-coded the path to libtorch. This commit replaces the hard-code path using `pip show` to get the updated location. 
--- .github/workflows/test_suite_windows.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_suite_windows.yml b/.github/workflows/test_suite_windows.yml index 3cfb2321..a667e8a5 100644 --- a/.github/workflows/test_suite_windows.yml +++ b/.github/workflows/test_suite_windows.yml @@ -69,11 +69,13 @@ jobs: shell: cmd run: | cd src + rem find torch location + for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set package_path=%%i cmake ^ -Bbuild ^ -G "NMake Makefiles" ^ -DCMAKE_Fortran_FLAGS="/fpscomp:logicals" ^ - -DCMAKE_PREFIX_PATH="C:\hostedtoolcache\windows\Python\3.12.7\x64\Lib\site-packages" ^ + -DCMAKE_PREFIX_PATH=%package_path% ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_Fortran_COMPILER=ifx ^ -DCMAKE_C_COMPILER=icx ^ @@ -85,7 +87,7 @@ jobs: - name: Integration tests shell: cmd run: | - set PATH=C:\hostedtoolcache\windows\Python\3.12.7\x64\Lib\site-packages;%PATH% + for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set package_path=%%i set PATH=C:\Program Files (x86)\FTorch\bin;%PATH% - set PATH=C:\hostedtoolcache\windows\Python\3.12.7\x64\Lib\site-packages\torch\lib;%PATH% + set PATH=%package_path%\torch\lib;%PATH% run_integration_tests.bat From 98917d38c807e253ecd9662000316599a9be7b87 Mon Sep 17 00:00:00 2001 From: tommelt Date: Mon, 16 Dec 2024 22:09:59 +0000 Subject: [PATCH 36/43] chore: rename variable to torch_path --- .github/workflows/test_suite_windows.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_suite_windows.yml b/.github/workflows/test_suite_windows.yml index a667e8a5..d96712dd 100644 --- a/.github/workflows/test_suite_windows.yml +++ b/.github/workflows/test_suite_windows.yml @@ -70,12 +70,12 @@ jobs: run: | cd src rem find torch location - for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set package_path=%%i + for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set torch_path=%%i cmake ^ -Bbuild ^ -G "NMake Makefiles" ^ -DCMAKE_Fortran_FLAGS="/fpscomp:logicals" ^ - -DCMAKE_PREFIX_PATH=%package_path% ^ + -DCMAKE_PREFIX_PATH=%torch_path% ^ -DCMAKE_BUILD_TYPE=Release ^ -DCMAKE_Fortran_COMPILER=ifx ^ -DCMAKE_C_COMPILER=icx ^ @@ -87,7 +87,7 @@ jobs: - name: Integration tests shell: cmd run: | - for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set package_path=%%i + for /f "tokens=2*" %%i in ('pip show torch ^| findstr /R "^Location"') do set torch_path=%%i set PATH=C:\Program Files (x86)\FTorch\bin;%PATH% - set PATH=%package_path%\torch\lib;%PATH% + set PATH=%torch_path%\torch\lib;%PATH% run_integration_tests.bat From f0ed978de73d86b4167daa8af2ddd9ef3563e0b2 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Wed, 18 Dec 2024 09:41:45 +0000 Subject: [PATCH 37/43] Reformulate autograd example to test multiply and divide --- examples/6_Autograd/autograd.f90 | 4 ++-- examples/6_Autograd/autograd.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index 0abb1b4c..88b7776e 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -53,11 +53,11 @@ program example ! Check arithmetic operations work for torch_tensors write (*,*) "a = ", in_data1(:,1) write (*,*) "b = ", in_data2(:,1) - Q = 3 * a ** 3 - b ** 2 + Q = 3 * (a**3 - b * b / 3) ! 
Extract a Fortran array from a Torch tensor call torch_tensor_to_array(Q, out_data, shape(in_data1)) - write (*,*) "Q = 3 * a ** 3 - b ** 2 =", out_data(:,1) + write (*,*) "Q = 3 * (a ** 3 - b * b / 2) =", out_data(:,1) ! Check output tensor matches expected value expected(:,1) = [-12.0_wp, 65.0_wp] diff --git a/examples/6_Autograd/autograd.py b/examples/6_Autograd/autograd.py index cc7ee753..9fdd816c 100644 --- a/examples/6_Autograd/autograd.py +++ b/examples/6_Autograd/autograd.py @@ -5,7 +5,7 @@ a = torch.tensor([2.0, 3.0], requires_grad=True) b = torch.tensor([6.0, 4.0], requires_grad=True) -Q = 3 * a**3 - b**2 +Q = 3 * (a**3 - b * b / 3) print(Q) expect = torch.tensor([-12.0, 65.0]) if not torch.allclose(Q, expect): From b804d54335989820846326fce92ac633fd1c2b03 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Wed, 18 Dec 2024 09:47:38 +0000 Subject: [PATCH 38/43] Implement postdivide --- examples/6_Autograd/autograd.f90 | 2 +- src/ctorch.cpp | 9 +++ src/ctorch.h | 11 ++- src/ftorch.F90 | 133 +++++++++++++++++++++++++++++++ src/ftorch.fypp | 27 +++++++ 5 files changed, 180 insertions(+), 2 deletions(-) diff --git a/examples/6_Autograd/autograd.f90 b/examples/6_Autograd/autograd.f90 index 88b7776e..6b95c6fb 100644 --- a/examples/6_Autograd/autograd.f90 +++ b/examples/6_Autograd/autograd.f90 @@ -5,7 +5,7 @@ program example ! Import our library for interfacing with PyTorch's Autograd module use ftorch, only: assignment(=), operator(+), operator(-), operator(*), & - operator(**), torch_kCPU, torch_tensor, torch_tensor_delete, & + operator(/), operator(**), torch_kCPU, torch_tensor, torch_tensor_delete, & torch_tensor_from_array, torch_tensor_to_array ! Import our tools module for testing utils diff --git a/src/ctorch.cpp b/src/ctorch.cpp index b06fe12a..d64443fd 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -300,6 +300,15 @@ torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, return output; } +torch_tensor_t torch_tensor_postdivide(const torch_tensor_t tensor, + const torch_data_t scalar) { + auto t = reinterpret_cast(tensor); + torch::Tensor *output = nullptr; + output = new torch::Tensor; + *output = *t / scalar; + return output; +} + torch_tensor_t torch_tensor_power(const torch_tensor_t tensor, const torch_data_t exponent) { auto t = reinterpret_cast(tensor); diff --git a/src/ctorch.h b/src/ctorch.h index e9ddab39..a4a6310a 100644 --- a/src/ctorch.h +++ b/src/ctorch.h @@ -180,7 +180,7 @@ EXPORT_C torch_tensor_t torch_tensor_premultiply(const torch_data_t scalar, const torch_tensor_t tensor); /** - * Overloads the postmultiplication operator for a scalar and a Torch Tensor + * Overloads the postmultiplication operator for a Torch Tensor and a scalar * @param Tensor to be multiplied * @param scalar to multiply by * @return product of the Tensor and scalar @@ -197,6 +197,15 @@ EXPORT_C torch_tensor_t torch_tensor_postmultiply(const torch_tensor_t tensor, EXPORT_C torch_tensor_t torch_tensor_divide(const torch_tensor_t tensor1, const torch_tensor_t tensor2); +/** + * Overloads the post-division operator for a Torch Tensor and a scalar + * @param Tensor to be divided + * @param scalar to divide by + * @return quotient of the Tensor and scalar + */ +EXPORT_C torch_tensor_t torch_tensor_postdivide(const torch_tensor_t tensor, + const torch_data_t scalar); + /** * Overloads the exponentiation operator for two Torch Tensors * @param Tensor to take the power of diff --git a/src/ftorch.F90 b/src/ftorch.F90 index 867e2e00..80884d56 100644 --- a/src/ftorch.F90 +++ 
b/src/ftorch.F90 @@ -178,6 +178,12 @@ end function torch_from_blob_c interface operator (/) module procedure torch_tensor_divide + module procedure torch_tensor_postdivide_int8 + module procedure torch_tensor_postdivide_int16 + module procedure torch_tensor_postdivide_int32 + module procedure torch_tensor_postdivide_int64 + module procedure torch_tensor_postdivide_real32 + module procedure torch_tensor_postdivide_real64 end interface interface operator (**) @@ -811,6 +817,133 @@ end function torch_tensor_divide_c output%p = torch_tensor_divide_c(tensor1%p, tensor2%p) end function torch_tensor_divide + !> Overloads division operator for a tensor and a scalar of type int8. + function torch_tensor_postdivide_int8(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int8), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int8 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int8), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_int8 + + !> Overloads division operator for a tensor and a scalar of type int16. + function torch_tensor_postdivide_int16(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int16), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int16 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int16), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_int16 + + !> Overloads division operator for a tensor and a scalar of type int32. + function torch_tensor_postdivide_int32(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int32), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int32 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int32), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_int32 + + !> Overloads division operator for a tensor and a scalar of type int64. 
+ function torch_tensor_postdivide_int64(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + integer(kind=int64), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : int64 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + integer(kind=int64), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_int64 + + !> Overloads division operator for a tensor and a scalar of type real32. + function torch_tensor_postdivide_real32(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real32), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real32 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real32), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_real32 + + !> Overloads division operator for a tensor and a scalar of type real64. + function torch_tensor_postdivide_real64(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + real(kind=real64), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : real64 + implicit none + type(c_ptr), value, intent(in) :: tensor_c + real(kind=real64), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_real64 + + !> Overloads exponentiation operator for a tensor and a scalar of type `int8` function torch_tensor_power_int8(tensor, power) result(output) type(torch_tensor), intent(in) :: tensor diff --git a/src/ftorch.fypp b/src/ftorch.fypp index 5aa0148b..db02e2b6 100644 --- a/src/ftorch.fypp +++ b/src/ftorch.fypp @@ -137,6 +137,9 @@ module ftorch interface operator (/) module procedure torch_tensor_divide + #:for PREC in PRECISIONS + module procedure torch_tensor_postdivide_${PREC}$ + #:endfor end interface interface operator (**) @@ -561,6 +564,30 @@ contains output%p = torch_tensor_divide_c(tensor1%p, tensor2%p) end function torch_tensor_divide + #:for PREC in PRECISIONS + !> Overloads division operator for a tensor and a scalar of type ${PREC}$. 
+ function torch_tensor_postdivide_${PREC}$(tensor, scalar) result(output) + type(torch_tensor), intent(in) :: tensor + ${f_type(PREC)}$(kind=${PREC}$), intent(in) :: scalar + type(torch_tensor) :: output + + interface + function torch_tensor_postdivide_c(tensor_c, scalar_c) & + result(output_c) bind(c, name = 'torch_tensor_postdivide') + use, intrinsic :: iso_c_binding, only : c_ptr + use, intrinsic :: iso_fortran_env, only : ${PREC}$ + implicit none + type(c_ptr), value, intent(in) :: tensor_c + ${f_type(PREC)}$(kind=${PREC}$), value, intent(in) :: scalar_c + type(c_ptr) :: output_c + end function torch_tensor_postdivide_c + end interface + + output%p = torch_tensor_postdivide_c(tensor%p, scalar) + end function torch_tensor_postdivide_${PREC}$ + + #:endfor + #:for PREC in PRECISIONS !> Overloads exponentiation operator for a tensor and a scalar of type `${PREC}$` function torch_tensor_power_${PREC}$(tensor, power) result(output) From c0a3fe49c73f130cf51ab6438add5469c93027b2 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Wed, 18 Dec 2024 10:19:12 +0000 Subject: [PATCH 39/43] Point to Torch C++ API --- pages/developer.md | 8 ++++++++ src/ctorch.cpp | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/pages/developer.md b/pages/developer.md index 826871be..8e2f9c54 100644 --- a/pages/developer.md +++ b/pages/developer.md @@ -77,6 +77,14 @@ and many of our users wish to _"clone-and-go"_ rather than develop, we provide b Development should only take place in `ftorch.fypp`, however._ +### Torch C++ API + +When extending or modifying functionality related to C++ header and/or source +files `src/ctorch.h` and `src/ctorch.cpp`, we refer to the Torch C++ +[API documentation](https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#exhale-class-classat-1-1-tensor) +page on the PyTorch website for details. + + ### git hook In order to streamline the process of uploading we provide a pre-commit hook in diff --git a/src/ctorch.cpp b/src/ctorch.cpp index d64443fd..cf0b73b2 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -1,3 +1,8 @@ +/* + * See + * https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#exhale-class-classat-1-1-tensor + * for more details on the Torch Tensor C++ API. + */ #include #include From 1e852b41282c8423722917d6997c74de095e35b8 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Wed, 18 Dec 2024 10:37:07 +0000 Subject: [PATCH 40/43] Update docs on autograd; add reference to looping example --- pages/autograd.md | 42 ++++++++++++++++++++++++++++++++++++++++++ pages/examples.md | 11 +++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 pages/autograd.md diff --git a/pages/autograd.md b/pages/autograd.md new file mode 100644 index 00000000..39e33fca --- /dev/null +++ b/pages/autograd.md @@ -0,0 +1,42 @@ +title: Online training + +[TOC] + +## Current state + +FTorch has supported offline training of ML models for some time. We are +currently working on extending its functionality to support online training, +too. This will involve exposing the automatic differentiation and +back-propagation functionality in PyTorch/libtorch. + +In the following, we document a workplan of the related functionality. Each step +below will be updated upon completion. + +### Operator overloading + +Mathematical operators involving Tensors are overloaded, so that we can compute +expressions involving outputs from one or more ML models. 
+ +Whilst it's possible to import such functionality with a bare +```fortran +use ftorch +``` +statement, the best practice is to import specifically the operators that you +wish to use. Note that the assignment operator `=` has a slightly different +notation: +``` +use ftorch, only: assignment(=), operator(+), operator(-), operator(*), & + operator(/), operator(**) +``` + +For a concrete example of how to compute mathematical expressions involving +Torch tensors, see the associated +[worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/6_Autograd). + +### The `requires_grad` property + +*Not yet implemented.* + +### The `backward` operator + +*Not yet implemented.* diff --git a/pages/examples.md b/pages/examples.md index 3e7130d9..dcbaf9d4 100644 --- a/pages/examples.md +++ b/pages/examples.md @@ -187,9 +187,16 @@ data to multiple GPU devices. considers a variant of the SimpleNet demo, which demonstrates how to account for multiple input tensors and multiple output tensors. +#### 5) Looping + +[This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/5_Looping) +demonstrates best practices for performing inference on the same network with +different input multiple times in the same workflow. + #### 6) Autograd -[This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/5_Autograd) +[This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/6_Autograd) is currently under development. Eventually, it will demonstrate how to perform automatic differentiation in FTorch by leveraging PyTorch's Autograd module. -Currently, it just demonstrates how to use `torch_tensor_to_array`. +Currently, it just demonstrates how to use `torch_tensor_to_array` and compute +mathematical expressions involving Torch tensors. 
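As a minimal self-contained sketch of how the overloaded operators documented above combine in practice (closely following `examples/6_Autograd/autograd.f90`; the `torch_tensor_from_array` call is written against the interfaces shown earlier in this series, so its argument list should be treated as indicative rather than definitive):

```fortran
! Illustrative sketch only: exercises the overloaded assignment, -, *, / and **
! operators on Torch tensors, mirroring examples/6_Autograd/autograd.f90.
program operator_overloads_sketch
  use, intrinsic :: iso_fortran_env, only : sp => real32
  use ftorch, only: assignment(=), operator(+), operator(-), operator(*), &
                    operator(/), operator(**), torch_kCPU, torch_tensor, &
                    torch_tensor_delete, torch_tensor_from_array, &
                    torch_tensor_to_array
  implicit none

  real(sp), dimension(2,1), target :: in_data1, in_data2
  real(sp), dimension(:,:), pointer :: out_data
  integer :: tensor_layout(2) = [1, 2]
  type(torch_tensor) :: a, b, Q

  in_data1(:,1) = [2.0_sp, 3.0_sp]
  in_data2(:,1) = [6.0_sp, 4.0_sp]

  ! Wrap the Fortran arrays as Torch tensors on the CPU
  call torch_tensor_from_array(a, in_data1, tensor_layout, torch_kCPU)
  call torch_tensor_from_array(b, in_data2, tensor_layout, torch_kCPU)

  ! Elementwise expression built from the overloaded operators;
  ! algebraically this is 3*a**3 - b**2
  Q = 3 * (a**3 - b * b / 3)

  ! View the result as a Fortran array and print it
  call torch_tensor_to_array(Q, out_data, shape(in_data1))
  write (*,*) "Q = ", out_data(:,1)

  call torch_tensor_delete(a)
  call torch_tensor_delete(b)
  call torch_tensor_delete(Q)
end program operator_overloads_sketch
```

Algebraically the expression is `3 * a**3 - b**2`, so with `a = [2.0, 3.0]` and `b = [6.0, 4.0]` the expected output is `[-12.0, 65.0]`, matching the checks in the worked example and in `autograd.py`.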
From 8dfa92c4709e534a7507cd68bba7c8f26d20608f Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Thu, 19 Dec 2024 09:20:10 +0000 Subject: [PATCH 41/43] Revert adding example 3 to build --- examples/CMakeLists.txt | 4 +--- src/CMakeLists.txt | 6 ++---- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index a2ab317e..0bafb40e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,9 +1,7 @@ if(CMAKE_BUILD_TESTS) add_subdirectory(1_SimpleNet) add_subdirectory(2_ResNet18) - if(ENABLE_CUDA) - add_subdirectory(3_MultiGPU) - endif() + # add_subdirectory(3_MultiGPU) add_subdirectory(4_MultiIO) # add_subdirectory(5_Looping) add_subdirectory(6_Autograd) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2b26818d..6c00aceb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -116,10 +116,8 @@ if(CMAKE_BUILD_TESTS) DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/2_ResNet18 DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) - if(ENABLE_CUDA) - file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/3_MultiGPU - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) - endif() + # file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/3_MultiGPU + # DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/4_MultiIO DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/test/examples) # file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/../examples/5_Looping From 4406fd86c4cbe0c6b8f1093efd18081c409c3b49 Mon Sep 17 00:00:00 2001 From: Joe Wallwork <22053413+jwallwork23@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:56:38 +0000 Subject: [PATCH 42/43] Write as LibTorch in pages/autograd.md Co-authored-by: Jack Atkinson <109271713+jatkinson1000@users.noreply.github.com> --- pages/autograd.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pages/autograd.md b/pages/autograd.md index 39e33fca..92fce98e 100644 --- a/pages/autograd.md +++ b/pages/autograd.md @@ -7,7 +7,7 @@ title: Online training FTorch has supported offline training of ML models for some time. We are currently working on extending its functionality to support online training, too. This will involve exposing the automatic differentiation and -back-propagation functionality in PyTorch/libtorch. +back-propagation functionality in PyTorch/LibTorch. In the following, we document a workplan of the related functionality. Each step below will be updated upon completion. From e82a9dbf3fb9090e64cb47402ed206b9a924dc75 Mon Sep 17 00:00:00 2001 From: Joe Wallwork Date: Fri, 20 Dec 2024 15:01:46 +0000 Subject: [PATCH 43/43] Use better links for Torch C++ docs --- pages/developer.md | 7 ++++--- src/ctorch.cpp | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pages/developer.md b/pages/developer.md index 73f90d2b..7b3f710c 100644 --- a/pages/developer.md +++ b/pages/developer.md @@ -80,9 +80,10 @@ Development should only take place in `ftorch.fypp`, however._ ### Torch C++ API When extending or modifying functionality related to C++ header and/or source -files `src/ctorch.h` and `src/ctorch.cpp`, we refer to the Torch C++ -[API documentation](https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#exhale-class-classat-1-1-tensor) -page on the PyTorch website for details. 
+files `src/ctorch.h` and `src/ctorch.cpp`, we refer to the Torch +[C++ documentation](https://pytorch.org/cppdocs) and more specifically the +[C++ API documentation](https://pytorch.org/cppdocs/api/library_root.html) +pages on the PyTorch website for details. ### git hook diff --git a/src/ctorch.cpp b/src/ctorch.cpp index cf0b73b2..50f77a39 100644 --- a/src/ctorch.cpp +++ b/src/ctorch.cpp @@ -1,7 +1,7 @@ /* - * See - * https://pytorch.org/cppdocs/api/classat_1_1_tensor.html#exhale-class-classat-1-1-tensor - * for more details on the Torch Tensor C++ API. + * For more details on the Torch Tensor C++ API, we refer to the Torch C++ documentation + * (https://pytorch.org/cppdocs) and more specifically the C++ API documentation + * (https://pytorch.org/cppdocs/api/library_root.html) pages on the PyTorch website. */ #include #include