From 18fda7c4c12e10c0839ac59d83b4475417413cec Mon Sep 17 00:00:00 2001 From: ahueck Date: Fri, 1 Nov 2024 13:33:58 +0100 Subject: [PATCH] Release v0.2 (#2) * Enable Clang/LLVM 18 support (#1) * Added support for * cudaMemcpy2D and async variant, * cudaMemset2DAsync and async variant, * cudaStreamCreateWithPriority and cudaEventCreateWithFlags * Refactored test setup, static pass and runtime distinction * TypeART as optional dependency, works with LLVM 14 --------- Co-authored-by: Tim Ziegler --- README.md | 94 ++- cmake/cusanToolchain.cmake | 12 +- cmake/modules/cusan-llvm.cmake | 2 +- externals/CMakeLists.txt | 12 +- lib/analysis/CMakeLists.txt | 2 + lib/analysis/KernelAnalysis.cpp | 185 +++--- lib/analysis/KernelModel.cpp | 24 +- lib/analysis/KernelModel.h | 15 +- lib/analysis/ModelIO.cpp | 9 +- lib/pass/AnalysisTransform.cpp | 624 ++++++++++++++++++ lib/pass/AnalysisTransform.h | 518 ++------------- lib/pass/CMakeLists.txt | 6 + lib/pass/CusanPass.cpp | 68 +- lib/pass/FunctionDecl.cpp | 129 ++++ lib/pass/FunctionDecl.h | 115 +--- lib/runtime/CMakeLists.txt | 11 +- lib/runtime/CusanRuntime.cpp | 471 ++++++++----- lib/runtime/CusanRuntime.h | 11 +- lib/runtime/MPIInterception.cpp | 2 +- lib/runtime/TSanInterface.h | 7 + lib/runtime/TSan_External.h | 10 +- lib/support/Util.h | 36 +- scripts/CMakeLists.txt | 152 +++-- scripts/cusan-tmpl.sh.in | 106 ++- scripts/cusan-wrapper.in | 465 ++++++------- test/CMakeLists.txt | 35 +- test/kernel_analysis/01_ptr_write.c | 58 ++ .../02_val_readwrite_ptr_read_ptr_write.c | 47 ++ test/kernel_analysis/03_struct_write.c | 101 +++ test/kernel_analysis/04_struct_ptr.c | 108 +++ .../05_struct_inside_of_struct.c | 86 +++ test/kernel_analysis/06_cuda_labmda.c | 59 ++ test/kernel_analysis/07_negative_array.c | 57 ++ test/kernel_analysis/08_big_struct_write.c | 104 +++ test/lit.cfg | 6 +- test/pass/01_test.c | 26 +- test/pass/02_event.c | 18 +- test/pass/03_cuda_to_mpi.c | 19 +- test/pass/04_mpi_to_cuda.c | 23 +- test/pass/05_cuda_to_mpi_stream.c | 20 +- test/pass/06_cuda_to_mpi_event.c | 23 +- test/pass/07_cuda_to_mpi_read.c | 17 +- test/pass/08_cudamemcpy_to_mpi.c | 24 +- test/pass/09_cudamemcpy_default.c | 23 +- test/pass/10_cudahostalloc.c | 27 +- test/pass/11_cuda_to_mpi_struct_of_buff.c | 49 +- test/pass/11_struct_of_buff.c | 49 +- test/pass/12_struct_ptr.c | 37 +- test/pass/13_struct_recursion.c | 40 +- test/pass/14_cuda_functor.c | 25 +- test/pass/15_cuda_memset_sync.c | 25 +- test/pass/15_cuda_memset_sync_nonblocking.c | 28 +- test/pass/17_cuda_stream_query_busy_loop.c | 19 +- test/pass/18_cuda_event_query_busy_loop.c | 23 +- ...18_cuda_event_with_flags_query_busy_loop.c | 67 ++ .../18_cuda_to_mpi_event_query_busy_loop.c | 24 +- ...19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c | 27 +- ...MemcpyAsyncH2H_implicit_sync_nonblocking.c | 28 +- ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c | 23 +- test/pass/20_cuda_default_stream_sync.c | 22 +- test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c | 20 +- test/pass/21_chunked_streams_example.c | 26 +- test/pass/22_cuda_to_mpi_partial_buff_write.c | 21 +- test/pass/23_cuda_default_stream_post_sync.c | 21 +- ...uda_default_stream_post_sync_nonblocking.c | 23 +- .../24_cuda_sync_stream_default_nonblocking.c | 19 +- .../pass/25_cuda_default_stream_double_sync.c | 22 +- test/pass/26_malloc_pitch.c | 86 +++ test/pass/28_cuda_memset2d_implicit_syn.c | 103 +++ test/pass/29_tsan_cuda_to_mpi.c | 81 +++ test/pass/30_tsan_annotate_cuda_to_mpi.c | 83 +++ test/pass/31_tsan_cuda_event.c | 60 ++ test/pass/32_tsan_async_copy.c | 77 +++ 
test/pass/33_tsan_wait_event.c | 101 +++ test/pass/TSan_External.h | 20 +- test/runtime/02_event.c | 63 ++ test/runtime/03_cuda_to_mpi.c | 83 +++ test/runtime/04_mpi_to_cuda.c | 109 +++ test/runtime/05_cuda_to_mpi_stream.c | 88 +++ test/runtime/06_cuda_to_mpi_event.c | 87 +++ test/runtime/07_cuda_to_mpi_read.c | 94 +++ test/runtime/08_cudamemcpy_to_mpi.c | 71 ++ test/runtime/09_cudamemcpy_default.c | 33 + test/runtime/10_cudahostalloc.c | 21 + test/runtime/11_cuda_to_mpi_struct_of_buff.c | 91 +++ test/runtime/11_struct_of_buff.c | 69 ++ test/runtime/12_struct_ptr.c | 98 +++ test/runtime/13_struct_recursion.c | 73 ++ test/runtime/14_cuda_functor.c | 64 ++ test/runtime/15_cuda_memset_sync.c | 69 ++ .../runtime/15_cuda_memset_sync_nonblocking.c | 69 ++ .../16_cuda_hostalloc_implicit_sync.c | 4 +- .../16_cuda_malloc_implicit_sync.c | 4 +- test/runtime/17_cuda_stream_query_busy_loop.c | 61 ++ test/runtime/18_cuda_event_query_busy_loop.c | 64 ++ ...18_cuda_event_with_flags_query_busy_loop.c | 64 ++ .../18_cuda_to_mpi_event_query_busy_loop.c | 86 +++ ...19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c | 73 ++ ...MemcpyAsyncH2H_implicit_sync_nonblocking.c | 74 +++ ...pi_send_cudaMemcpyAsyncH2H_implicit_sync.c | 91 +++ test/runtime/20_cuda_default_stream_sync.c | 67 ++ .../runtime/20_cuda_to_mpi_send_ds_sync_w_r.c | 89 +++ test/runtime/21_chunked_streams_example.c | 114 ++++ .../22_cuda_to_mpi_partial_buff_write.c | 83 +++ .../23_cuda_default_stream_post_sync.c | 67 ++ ...uda_default_stream_post_sync_nonblocking.c | 67 ++ .../24_cuda_sync_stream_default_nonblocking.c | 63 ++ .../25_cuda_default_stream_double_sync.c | 68 ++ test/runtime/26_malloc_pitch.c | 93 +++ test/runtime/27_cuda_memcpy2d_implicit_syn.c | 87 +++ test/runtime/28_cuda_memset2d_implicit_syn.c | 91 +++ .../29_tsan_cuda_to_mpi.c} | 7 +- .../30_tsan_annotate_cuda_to_mpi.c} | 6 +- .../31_tsan_cuda_event.c} | 15 +- .../32_tsan_async_copy.c} | 16 +- .../33_tsan_wait_event.c} | 22 +- test/runtime/34_negative_array.c | 63 ++ test/runtime/35_struct_null.c | 61 ++ test/{tsan => runtime}/TSan_External.h | 0 test/{tsan => runtime}/suppressions.txt | 2 +- 120 files changed, 6215 insertions(+), 1785 deletions(-) create mode 100644 lib/pass/AnalysisTransform.cpp create mode 100644 lib/pass/FunctionDecl.cpp create mode 100644 lib/runtime/TSanInterface.h create mode 100644 test/kernel_analysis/01_ptr_write.c create mode 100644 test/kernel_analysis/02_val_readwrite_ptr_read_ptr_write.c create mode 100644 test/kernel_analysis/03_struct_write.c create mode 100644 test/kernel_analysis/04_struct_ptr.c create mode 100644 test/kernel_analysis/05_struct_inside_of_struct.c create mode 100644 test/kernel_analysis/06_cuda_labmda.c create mode 100644 test/kernel_analysis/07_negative_array.c create mode 100644 test/kernel_analysis/08_big_struct_write.c create mode 100644 test/pass/18_cuda_event_with_flags_query_busy_loop.c create mode 100644 test/pass/26_malloc_pitch.c create mode 100644 test/pass/28_cuda_memset2d_implicit_syn.c create mode 100644 test/pass/29_tsan_cuda_to_mpi.c create mode 100644 test/pass/30_tsan_annotate_cuda_to_mpi.c create mode 100644 test/pass/31_tsan_cuda_event.c create mode 100644 test/pass/32_tsan_async_copy.c create mode 100644 test/pass/33_tsan_wait_event.c create mode 100644 test/runtime/02_event.c create mode 100644 test/runtime/03_cuda_to_mpi.c create mode 100644 test/runtime/04_mpi_to_cuda.c create mode 100644 test/runtime/05_cuda_to_mpi_stream.c create mode 100644 test/runtime/06_cuda_to_mpi_event.c create mode 100644 
test/runtime/07_cuda_to_mpi_read.c create mode 100644 test/runtime/08_cudamemcpy_to_mpi.c create mode 100644 test/runtime/09_cudamemcpy_default.c create mode 100644 test/runtime/10_cudahostalloc.c create mode 100644 test/runtime/11_cuda_to_mpi_struct_of_buff.c create mode 100644 test/runtime/11_struct_of_buff.c create mode 100644 test/runtime/12_struct_ptr.c create mode 100644 test/runtime/13_struct_recursion.c create mode 100644 test/runtime/14_cuda_functor.c create mode 100644 test/runtime/15_cuda_memset_sync.c create mode 100644 test/runtime/15_cuda_memset_sync_nonblocking.c rename test/{pass => runtime}/16_cuda_hostalloc_implicit_sync.c (87%) rename test/{pass => runtime}/16_cuda_malloc_implicit_sync.c (87%) create mode 100644 test/runtime/17_cuda_stream_query_busy_loop.c create mode 100644 test/runtime/18_cuda_event_query_busy_loop.c create mode 100644 test/runtime/18_cuda_event_with_flags_query_busy_loop.c create mode 100644 test/runtime/18_cuda_to_mpi_event_query_busy_loop.c create mode 100644 test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c create mode 100644 test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c create mode 100644 test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c create mode 100644 test/runtime/20_cuda_default_stream_sync.c create mode 100644 test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c create mode 100644 test/runtime/21_chunked_streams_example.c create mode 100644 test/runtime/22_cuda_to_mpi_partial_buff_write.c create mode 100644 test/runtime/23_cuda_default_stream_post_sync.c create mode 100644 test/runtime/23_cuda_default_stream_post_sync_nonblocking.c create mode 100644 test/runtime/24_cuda_sync_stream_default_nonblocking.c create mode 100644 test/runtime/25_cuda_default_stream_double_sync.c create mode 100644 test/runtime/26_malloc_pitch.c create mode 100644 test/runtime/27_cuda_memcpy2d_implicit_syn.c create mode 100644 test/runtime/28_cuda_memset2d_implicit_syn.c rename test/{tsan/01_tsan_cuda_to_mpi.c => runtime/29_tsan_cuda_to_mpi.c} (86%) rename test/{tsan/02_tsan_annotate_cuda_to_mpi.c => runtime/30_tsan_annotate_cuda_to_mpi.c} (87%) rename test/{tsan/03_tsan_cuda_event.c => runtime/31_tsan_cuda_event.c} (63%) rename test/{tsan/04_tsan_async_copy.c => runtime/32_tsan_async_copy.c} (64%) rename test/{tsan/05_tsan_wait_event.c => runtime/33_tsan_wait_event.c} (63%) create mode 100644 test/runtime/34_negative_array.c create mode 100644 test/runtime/35_struct_null.c rename test/{tsan => runtime}/TSan_External.h (100%) rename test/{tsan => runtime}/suppressions.txt (95%) diff --git a/README.md b/README.md index c4840a6..f75f3d2 100644 --- a/README.md +++ b/README.md @@ -1,73 +1,103 @@ # CuSan · [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) CuSan is tool to find data races between (asynchronous) CUDA calls and the host. -To that end, we analyze and instrument CUDA codes to track CUDA domain-specific memory accesses and synchronization semantics during compilation using LLVM. -Our runtime then passes these information appropriately to [ThreadSanitizer](https://clang.llvm.org/docs/ThreadSanitizer.html) (packaged with Clang/LLVM) for the final data race analysis. + +To that end, during compilation with Clang/LLVM, we analyze and instrument CUDA API usage in the target code to track CUDA-specific memory accesses and synchronization semantics. 
+Our runtime then exposes this information to [ThreadSanitizer](https://clang.llvm.org/docs/ThreadSanitizer.html) (packaged with Clang/LLVM) for the final data race analysis. ## Usage Making use of CuSan consists of two phases: -1. Compile your code with Clang/LLVM (version 14) using one the CuSan compiler wrappers, e.g., `cusan-clang++` or `cusan-mpic++`. -This will (a) analyze and instrument the CUDA API appropriately, such as kernel calls and their particular memory access semantics (r/w), (b) add ThreadSanitizer instrumentation, and (c) finally link our runtime library. +1. Compile your code using one of the CuSan compiler wrappers, e.g., `cusan-clang++` or `cusan-mpic++`. +This will (a) analyze and instrument the CUDA API, such as kernel calls and their particular memory access semantics (r/w), (b) add ThreadSanitizer instrumentation automatically (`-fsanitize=thread`), and (c) finally link our runtime library. 2. Execute the target program for the data race analysis. Our runtime internally calls ThreadSanitizer to expose the CUDA synchronization and memory access semantics. +#### Example usage +Given the file [02_event.c](test/runtime/02_event.c), execute the following for CUDA data race detection: + +```bash +$ cusan-clang -O3 -g 02_event.c -x cuda -gencode arch=compute_70,code=sm_70 -o event.exe +$ export TSAN_OPTIONS=ignore_noninstrumented_modules=1 +$ ./event.exe +``` ### Checking CUDA-aware MPI applications You need to use the MPI correctness checker [MUST](https://hpc.rwth-aachen.de/must/), or preload our (very) simple MPI interceptor `libCusanMPIInterceptor.so` for CUDA-aware MPI data race detection. These libraries call ThreadSanitizer with the particular access semantics of MPI. Therefore, the combined semantics of CUDA and MPI are properly exposed to ThreadSanitizer to detect data races of data dependent MPI and CUDA calls. +#### Example usage for MPI +Given the file [03_cuda_to_mpi.c](test/runtime/03_cuda_to_mpi.c), execute the following for CUDA data race detection: + +```bash +$ cusan-mpic++ -O3 -g 03_cuda_to_mpi.c -x cuda -gencode arch=compute_70,code=sm_70 -o cuda_to_mpi.exe +$ LD_PRELOAD=/path/to/libCusanMPIInterceptor.so mpirun -n 2 ./cuda_to_mpi.exe +``` + +*Note*: To avoid false positives, ThreadSanitizer suppression files might be needed; see, for example, [suppressions.txt](test/runtime/suppressions.txt), or the documentation on [sanitizer special case lists](https://clang.llvm.org/docs/SanitizerSpecialCaseList.html). #### Example report -The following is an example report for [03_cuda_to_mpi.c](test/pass/03_cuda_to_mpi.c) of our test suite, where the necessary synchronization is not called: +The following is an example report for [03_cuda_to_mpi.c](test/runtime/03_cuda_to_mpi.c) of our test suite, where the necessary synchronization is not called: ```c -L.23 __global__ void kernel(int* arr, const int N) +L.18 __global__ void kernel(int* arr, const int N) ...
-L.58 int* d_data; -L.59 cudaMalloc(&d_data, size * sizeof(int)); -L.60 -L.61 if (world_rank == 0) { -L.62 kernel<<>>(d_data, size); -L.63 #ifdef CUSAN_SYNC -L.64 cudaDeviceSynchronize(); // CUSAN_SYNC needs to be defined -L.65 #endif -L.66 MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); +L.53 int* d_data; +L.54 cudaMalloc(&d_data, size * sizeof(int)); +L.55 +L.56 if (world_rank == 0) { +L.57 kernel<<>>(d_data, size); +L.58 #ifdef CUSAN_SYNC +L.59 cudaDeviceSynchronize(); // CUSAN_SYNC needs to be defined +L.60 #endif +L.61 MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); ``` ``` ================== -WARNING: ThreadSanitizer: data race (pid=689288) - Read of size 8 at 0x7fb51f200000 by main thread: - #0 main cusan/test/pass/03_cuda_to_mpi.c:66:5 (03_cuda_to_mpi.c.exe+0x4e8448) +WARNING: ThreadSanitizer: data race (pid=579145) + Read of size 8 at 0x7f1587200000 by main thread: + #0 main cusan/test/runtime/03_cuda_to_mpi.c:61:5 (03_cuda_to_mpi.c.exe+0xfad11) - Previous write of size 8 at 0x7fb51f200000 by thread T6: - #0 __device_stub__kernel(int*, int) cusan/test/pass/03_cuda_to_mpi.c:23:47 (03_cuda_to_mpi.c.exe+0x4e81ef) + Previous write of size 8 at 0x7f1587200000 by thread T6: + #0 __device_stub__kernel(int*, int) cusan/test/runtime/03_cuda_to_mpi.c:18:47 (03_cuda_to_mpi.c.exe+0xfaaed) - Thread T6 'cuda_stream' (tid=0, running) created by main thread at: - #0 __pool_create_fiber_dbg cusan/build/_deps/fiber_pool-src/fiberpool.cpp:538:16 (libCusanFiberpool-d.so+0x1c152) - #1 main cusan/test/pass/03_cuda_to_mpi.c:59:3 (03_cuda_to_mpi.c.exe+0x4e8331) + Thread T6 'cuda_stream 0' (tid=0, running) created by main thread at: + #0 cusan::runtime::Runtime::register_stream(cusan::runtime::Stream) (libCusanRuntime.so+0x3b830) + #1 main cusan/test/runtime/03_cuda_to_mpi.c:54:3 (03_cuda_to_mpi.c.exe+0xfabc7) -SUMMARY: ThreadSanitizer: data race cusan/test/pass/03_cuda_to_mpi.c:66:5 in main +SUMMARY: ThreadSanitizer: data race cusan/test/runtime/03_cuda_to_mpi.c:61:5 in main ================== ThreadSanitizer: reported 1 warnings ``` -## Building cusan - -cusan requires LLVM version 14 and CMake version >= 3.20. Use CMake presets `develop` or `release` +#### Caveats ThreadSanitizer and OpenMPI +Known issues (on the Lichtenberg HPC system) to make ThreadSanitizer work with OpenMPI 4.1.6: +- Intel Compute Runtime requires environment flags to work with sanitizers, see [Intel Compute Runtime issue 376](https://github.com/intel/compute-runtime/issues/376): + ```bash + export NEOReadDebugKeys=1 + export DisableDeepBind=1 + ``` +- The sanitizer memory interceptor does not play well with OpenMPI's, see [OpenMPI issue 12819](https://github.com/open-mpi/ompi/issues/12819). Need to disable *patcher*: + ```bash + export OMPI_MCA_memory=^patcher + ``` + +## Building CuSan + +CuSan is tested with LLVM version 14 and 18, and CMake version >= 3.20. Use CMake presets `develop` or `release` to build. 
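+For example, a preset-based configure and build could look as follows (a minimal sketch; the preset and binary-directory names come from the project's `CMakePresets.json`, so adjust them if they differ):
+
+```sh
+$> cmake --preset release
+$> cmake --build build --target install --parallel
+```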
### Dependencies -CuSan was tested with: -- System modules: `1) gcc/11.2.0 2) cuda/11.8 3) openmpi/4.1.6 4) git/2.40.0 5) python/3.10.10 6) clang/14.0.6` -- External libraries: TypeART (https://github.com/tudasc/TypeART/tree/feat/cuda), FiberPool (optional, default off) +CuSan was tested on the TUDa Lichtenberg II cluster with: +- System modules: `1) gcc/11.2.0 2) cuda/11.8 3) openmpi/4.1.6 4) git/2.40.0 5) python/3.10.10 6) clang/14.0.6 or 6) clang/18.1.8` +- Optional external libraries: [TypeART](https://github.com/tudasc/TypeART/tree/v1.9.0b-cuda.1), FiberPool (both default off) - Testing: llvm-lit, FileCheck - GPU: Tesla T4 and Tesla V100 (mostly: arch=sm_70) ### Build example -cusan uses CMake to build. Example build recipe (release build, installs to default prefix +CuSan uses CMake to build. Example build recipe (release build, installs to default prefix `${cusan_SOURCE_DIR}/install/cusan`) ```sh @@ -80,7 +110,9 @@ $> cmake --build build --target install --parallel | Option | Default | Description | |------------------------------|:-------:|---------------------------------------------------------------------------------------------------| +| `CUSAN_TYPEART` | `OFF` | Use TypeART library to track memory allocations. | | `CUSAN_FIBERPOOL` | `OFF` | Use external library to efficiently manage fibers creation . | | `CUSAN_SOFTCOUNTER` | `OFF` | Runtime stats for calls to ThreadSanitizer and CUDA-callbacks. Only use for stats collection, not race detection. | | `CUSAN_SYNC_DETAIL_LEVEL` | `ON` | Analyze, e.g., memcpy and memcpyasync w.r.t. arguments to determine implicit sync. | | `CUSAN_LOG_LEVEL_RT` | `3` | Granularity of runtime logger. 3 is most verbose, 0 is least. For release, set to 0. | +| `CUSAN_LOG_LEVEL_PASS` | `3` | Granularity of pass plugin logger. 3 is most verbose, 0 is least. For release, set to 0. | diff --git a/cmake/cusanToolchain.cmake b/cmake/cusanToolchain.cmake index 489cd29..7bdab29 100644 --- a/cmake/cusanToolchain.cmake +++ b/cmake/cusanToolchain.cmake @@ -28,21 +28,15 @@ string(COMPARE EQUAL "${CMAKE_SOURCE_DIR}" "${PROJECT_SOURCE_DIR}" find_package(CUDAToolkit REQUIRED) find_package(MPI REQUIRED) -FetchContent_Declare( - typeart - GIT_REPOSITORY https://github.com/tudasc/TypeART.git - GIT_TAG v1.9.0b-cuda.1 - GIT_SHALLOW 1 -) -FetchContent_MakeAvailable(typeart) - option(CUSAN_TEST_CONFIGURE_IDE "Add targets for tests to help the IDE with completion etc." ON) mark_as_advanced(CUSAN_TEST_CONFIGURE_IDE) option(CUSAN_CONFIG_DIR_IS_SHARE "Install to \"share/cmake/\" instead of \"lib/cmake/\"" OFF) mark_as_advanced(CUSAN_CONFIG_DIR_IS_SHARE) set(CUSAN_LOG_LEVEL_RT 3 CACHE STRING "Granularity of runtime logger. 3 is most verbose, 0 is least.") +set(CUSAN_LOG_LEVEL_PASS 3 CACHE STRING "Granularity of transform pass logger. 
3 is most verbose, 0 is least.") +option(CUSAN_TYPEART "Use external typeart to track allocations" OFF) option(CUSAN_FIBERPOOL "Use external fiber pool to manage ThreadSanitizer fibers" OFF) option(CUSAN_SOFTCOUNTER "Print runtime counters" OFF) option(CUSAN_SYNC_DETAIL_LEVEL "Enable implicit sync analysis of memcpy/memset" ON) @@ -73,7 +67,7 @@ include(modules/cusan-format) include(modules/cusan-target-util) cusan_find_llvm_progs(CUSAN_CLANG_EXEC "clang-${LLVM_VERSION_MAJOR};clang" DEFAULT_EXE "clang") -cusan_find_llvm_progs(CUSAN_CLANGCXX_EXEC "clang-${LLVM_VERSION_MAJOR};clang++" DEFAULT_EXE "clang++") +cusan_find_llvm_progs(CUSAN_CLANGCXX_EXEC "clang++-${LLVM_VERSION_MAJOR};clang++" DEFAULT_EXE "clang++") cusan_find_llvm_progs(CUSAN_LLC_EXEC "llc-${LLVM_VERSION_MAJOR};llc" DEFAULT_EXE "llc") cusan_find_llvm_progs(CUSAN_OPT_EXEC "opt-${LLVM_VERSION_MAJOR};opt" DEFAULT_EXE "opt") diff --git a/cmake/modules/cusan-llvm.cmake b/cmake/modules/cusan-llvm.cmake index bb162ad..d673cee 100644 --- a/cmake/modules/cusan-llvm.cmake +++ b/cmake/modules/cusan-llvm.cmake @@ -20,7 +20,7 @@ function(cusan_llvm_module name sources) ) if(ARG_INCLUDE_DIRS) - target_include_directories(${name} + target_include_directories(${name} ${warning_guard} PRIVATE ${ARG_INCLUDE_DIRS} ) diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index c0823ae..863ccc1 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -31,4 +31,14 @@ if(CUSAN_FIBERPOOL) FETCHCONTENT_SOURCE_DIR_FIBER_POOL FETCHCONTENT_UPDATES_DISCONNECTED_FIBER_POOL ) -endif() \ No newline at end of file +endif() + +if(CUSAN_TYPEART) + FetchContent_Declare( + typeart + GIT_REPOSITORY https://github.com/tudasc/TypeART.git + GIT_TAG v1.9.0b-cuda.1 + GIT_SHALLOW 1 + ) + FetchContent_MakeAvailable(typeart) +endif() diff --git a/lib/analysis/CMakeLists.txt b/lib/analysis/CMakeLists.txt index 6c13540..4e8f218 100644 --- a/lib/analysis/CMakeLists.txt +++ b/lib/analysis/CMakeLists.txt @@ -22,6 +22,8 @@ target_include_directories(cusan_Analysis ${warning_guard} cusan_target_define_file_basename(cusan_Analysis) +target_compile_definitions(cusan_Analysis PRIVATE "LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}") + set(CONFIG_NAME cusanAnalysis) set(TARGETS_EXPORT_NAME ${CONFIG_NAME}Targets) diff --git a/lib/analysis/KernelAnalysis.cpp b/lib/analysis/KernelAnalysis.cpp index 62dc1c4..4802c0c 100644 --- a/lib/analysis/KernelAnalysis.cpp +++ b/lib/analysis/KernelAnalysis.cpp @@ -181,112 +181,107 @@ inline AccessState state(const llvm::Attribute::AttrKind mem) { return AccessState::kRW; } -struct ChildInfo { - llvm::Value* val; - llvm::SmallVector indices; -}; +void collect_subsequent_load(FunctionArg& arg, llvm::Value* value, llvm::SmallVector index_stack) { + using namespace llvm; + for (User* value_user : value->users()) { + if (auto* load = dyn_cast(value_user)) { + if (load->getType()->isPointerTy()) { + const auto res = determinePointerAccessAttrs(load); + const FunctionSubArg sub_arg{load, true, index_stack, true, state(res)}; + arg.subargs.push_back(sub_arg); + } + } + } +} -void collect_children(FunctionArg& arg, llvm::Value* init_val, llvm::SmallVector initial_index_stack = {}, - llvm::SmallSet visited_funcs = {}) { +void collect_children(FunctionArg& arg, llvm::Value* value, llvm::SmallSet& visited_funcs) { using namespace llvm; - llvm::SmallVector work_list; - work_list.push_back({init_val, std::move(initial_index_stack)}); - - while (!work_list.empty()) { - // not nice making copies of the stack all the time idk - auto curr_info = 
work_list.pop_back_val(); - auto* value = curr_info.val; - auto index_stack = curr_info.indices; - - Type* value_type = value->getType(); - if (auto* ptr_type = dyn_cast(value_type)) { - auto* elem_type = ptr_type->getPointerElementType(); - if (elem_type->isStructTy() || elem_type->isPointerTy()) { - for (User* value_user : value->users()) { - if (auto* call = dyn_cast(value_user)) { - Function* called = call->getCalledFunction(); - if (visited_funcs.contains(called)) { - LOG_WARNING("Not handling recursive kernels right now"); - continue; - } - if (called->isDeclaration()) { - LOG_WARNING("Could not determine pointer access of the " - << arg.arg_pos - << " Argument since its calling function outside of this cu: " << called->getName()); - continue; - } - visited_funcs.insert(called); - Argument* ipo_argument = called->getArg(arg.arg_pos); - { - const auto access_res = determinePointerAccessAttrs(ipo_argument); - // const FunctionSubArg sub_arg{ipo_argument, index_stack, true, state(access_res)}; - // arg.subargs.push_back(sub_arg); - // this argument should have already been looked at in the current function so if we - // check it again we should merge the results to get the correct accessstate - auto* res = - llvm::find_if(arg.subargs, [=](auto a) { return a.value.getValueOr(nullptr) == ipo_argument; }); - if (res == arg.subargs.end()) { - res->state = mergeAccessState(res->state, state(access_res)); - } else { - assert(false); - } - } - collect_children(arg, ipo_argument, index_stack); - } else if (auto* gep = dyn_cast(value_user)) { - auto gep_indicies = gep->indices(); - auto sub_index_stack = index_stack; - for (unsigned i = 1; i < gep->getNumIndices(); i++) { - auto* index = gep_indicies.begin() + i; - if (auto* index_value = dyn_cast(index->get())) { - sub_index_stack.push_back((int32_t)index_value->getSExtValue()); - work_list.push_back({gep, sub_index_stack}); - } else { - LOG_WARNING("Failed to determine access pattern for argument '" - << arg.arg_pos << "' since it uses dynamic gep indices"); - break; - } - } + + Type* value_type = value->getType(); + if (auto* ptr_type = dyn_cast(value_type)) { + // auto* elem_type = ptr_type->getPointerElementType(); + // if (elem_type->isStructTy() || elem_type->isPointerTy()) { + for (Use& value_use : value->uses()) { + User* value_user = value_use.getUser(); + if (auto* call = dyn_cast(value_user)) { + Function* called = call->getCalledFunction(); + if (visited_funcs.contains(called)) { + LOG_WARNING("Not handling recursive kernels right now"); + continue; + } + if (called->isDeclaration()) { + LOG_WARNING("Could not determine pointer access of the " + << arg.arg_pos + << " Argument since its calling function outside of this cu: " << called->getName()); + continue; + } + visited_funcs.insert(called); + + Argument* ipo_argument = called->getArg(value_use.getOperandNo()); + { + const auto access_res = determinePointerAccessAttrs(ipo_argument); + // const FunctionSubArg sub_arg{ipo_argument, index_stack, true, state(access_res)}; + // arg.subargs.push_back(sub_arg); + // this argument should have already been looked at in the current function so if we + // check it again we should merge the results to get the correct accessstate + auto* res = llvm::find_if(arg.subargs, [=](auto a) { return a.value.value_or(nullptr) == ipo_argument; }); + if (res == arg.subargs.end()) { + res->state = mergeAccessState(res->state, state(access_res)); + } else { + assert(false); } } - } - //{ - // const auto res = determinePointerAccessAttrs(load); - // const 
FunctionArg kernel_arg{load, index_stack, arg_pos, true, state(res)}; - // args.push_back(kernel_arg); - //} - for (User* value_user : value->users()) { - if (auto* load = dyn_cast(value_user)) { - if (load->getType()->isPointerTy()) { - auto sub_index_stack = index_stack; - sub_index_stack.push_back(-1); - work_list.push_back({load, sub_index_stack}); - const auto res = determinePointerAccessAttrs(load); - const FunctionSubArg sub_arg{load, std::move(sub_index_stack), true, state(res)}; - arg.subargs.push_back(sub_arg); + collect_children(arg, ipo_argument, visited_funcs); + } else if (auto* gep = dyn_cast(value_user)) { + auto gep_indicies = gep->indices(); + llvm::SmallVector indices = {}; + bool all_constant = true; + + for (unsigned i = 0; i < gep->getNumIndices(); i++) { + auto* index = gep_indicies.begin() + i; + if (auto* index_value = dyn_cast(index->get())) { + indices.push_back(index_value->getSExtValue()); + } else { + LOG_WARNING("Failed to determine access pattern for argument '" << arg.arg_pos + << "' since it uses dynamic gep indices"); + all_constant = false; + break; } } + if (all_constant) { + // const FunctionSubArg sub_arg{value, true, indices, true, state(res)}; + // arg.subargs.push_back(sub_arg); + collect_subsequent_load(arg, gep, std::move(indices)); + // work_list.push_back({gep, sub_index_stack}); + } } + } - } else { - return; + for (User* value_user : value->users()) { + if (dyn_cast(value_user)) { + collect_subsequent_load(arg, value, {}); + } } + } else { + return; } } void attribute_value(FunctionArg& arg) { using namespace llvm; - assert(arg.value.hasValue()); - auto* value = arg.value.getValue(); + assert(arg.value.has_value()); + auto* value = arg.value.value(); Type* value_type = value->getType(); if (value_type->isPointerTy()) { const auto res2 = determinePointerAccessAttrs(value); - const FunctionSubArg kernel_arg{value, {}, true, state(res2)}; + const FunctionSubArg kernel_arg{value, false, {}, true, state(res2)}; arg.is_pointer = true; arg.value = value; arg.subargs.emplace_back(kernel_arg); - collect_children(arg, value); + llvm::SmallSet visited_funcs = {}; + collect_children(arg, value, visited_funcs); } else { - const FunctionSubArg kernel_arg{value, {}, false, AccessState::kRW}; + const FunctionSubArg kernel_arg{value, false, {}, false, AccessState::kRW}; arg.subargs.emplace_back(kernel_arg); } } @@ -294,18 +289,6 @@ void attribute_value(FunctionArg& arg) { std::optional info_with_attributor(llvm::Function* kernel) { using namespace llvm; - auto* module = kernel->getParent(); - AnalysisGetter ag; - SetVector functions; - for (auto& module_f : module->functions()) { - functions.insert(&module_f); - } - CallGraphUpdater cg_updater; - BumpPtrAllocator allocator; - InformationCache info_cache(*module, ag, allocator, /* CGSCC */ nullptr); - - Attributor attrib(functions, info_cache, cg_updater); - LOG_DEBUG("Attributing " << kernel->getName() << "\n" << *kernel << "\n") llvm::SmallVector args{}; @@ -349,10 +332,14 @@ std::optional kernel_model_for_stub(llvm::Function* func, const Mod stub_name.erase(pos, prefix.length()); } return stub_name; - }(util::try_demangle(*func)); + }(util::try_demangle_fully(*func)); const auto result = llvm::find_if(models.models, [&stub_name](const auto& model_) { - return llvm::StringRef(util::demangle(model_.kernel_name)).startswith(stub_name); +#if LLVM_VERSION_MAJOR > 15 + return llvm::StringRef(util::try_demangle_fully(model_.kernel_name)).starts_with(stub_name); +#else + return 
llvm::StringRef(util::try_demangle_fully(model_.kernel_name)).startswith(stub_name); +#endif }); if (result != std::end(models.models)) { diff --git a/lib/analysis/KernelModel.cpp b/lib/analysis/KernelModel.cpp index 8eab472..f19eca4 100644 --- a/lib/analysis/KernelModel.cpp +++ b/lib/analysis/KernelModel.cpp @@ -43,35 +43,39 @@ bool ModelHandler::insert(const cusan::KernelModel& model) { return false; } + llvm::raw_ostream& operator<<(llvm::raw_ostream& os, const FunctionSubArg& arg) { os << "["; - if (arg.value.hasValue()) { - os << *arg.value.getValue(); + if (arg.value.has_value()) { + os << *arg.value.value(); } else { os << ""; } - if (!arg.indices.empty()) { - os << ", indices:["; - for (auto index : arg.indices) { + if (arg.does_load) { + os << ", is_loading"; + } + if (!arg.gep_indicies.empty()) { + os << ", gep_indices:["; + for (const auto& index : arg.gep_indicies) { os << index << ", "; } os << "]"; } else { - os << ", indices:[]"; + os << ", gep_indices:[]"; } os << ", ptr: " << static_cast(arg.is_pointer) << ", rw: " << arg.state << "]"; return os; } llvm::raw_ostream& operator<<(llvm::raw_ostream& os, const FunctionArg& arg) { os << "["; - if (arg.value.hasValue()) { - os << *arg.value.getValue(); + if (arg.value.has_value()) { + os << *arg.value.value(); } else { os << ""; } os << ", subArgs: ["; - for (const auto& arg : arg.subargs) { - os << arg; + for (const auto& sub_arg : arg.subargs) { + os << sub_arg; } os << "]"; os << ", ptr: " << static_cast(arg.is_pointer) << ", pos: " << arg.arg_pos << "]"; diff --git a/lib/analysis/KernelModel.h b/lib/analysis/KernelModel.h index 84a8a16..cc9a4df 100644 --- a/lib/analysis/KernelModel.h +++ b/lib/analysis/KernelModel.h @@ -7,7 +7,6 @@ #ifndef CUSAN_KERNELMODEL_H #define CUSAN_KERNELMODEL_H -#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Function.h" @@ -16,6 +15,7 @@ #include #include +#include namespace cusan { @@ -47,23 +47,24 @@ inline constexpr const char* access_state_string(AccessState state) { } struct FunctionSubArg { - llvm::Optional value{nullptr}; - llvm::SmallVector indices; // gep and loads needed to get the argument from 'actual' args + std::optional value = std::nullopt; + bool does_load; + llvm::SmallVector gep_indicies; // gep and loads needed to get the argument from 'actual' args bool is_pointer{false}; AccessState state{AccessState::kRW}; }; struct FunctionArg { - llvm::Optional value{nullptr}; + std::optional value = std::nullopt; unsigned arg_pos{0}; bool is_pointer{false}; llvm::SmallVector subargs; }; struct KernelModel { - llvm::Optional kernel{nullptr}; - std::string kernel_name{}; - llvm::SmallVector args{}; + std::optional kernel = std::nullopt; + std::string kernel_name; + llvm::SmallVector args; }; struct ModelHandler { diff --git a/lib/analysis/ModelIO.cpp b/lib/analysis/ModelIO.cpp index d3d9688..e30ef3b 100644 --- a/lib/analysis/ModelIO.cpp +++ b/lib/analysis/ModelIO.cpp @@ -53,7 +53,7 @@ template <> struct llvm::yaml::MappingTraits { static void mapping(IO& io, cusan::FunctionArg& info) { if (!io.outputting()) { - info.value = llvm::None; + info.value = std::nullopt; } io.mapRequired("position", info.arg_pos); io.mapRequired("pointer", info.is_pointer); @@ -67,9 +67,10 @@ template <> struct llvm::yaml::MappingTraits { static void mapping(IO& io, cusan::FunctionSubArg& info) { if (!io.outputting()) { - info.value = llvm::None; + info.value = std::nullopt; } - io.mapRequired("indices", info.indices); + io.mapRequired("does_load", 
info.does_load); + io.mapRequired("gep_indicies", info.gep_indicies); io.mapRequired("access", info.state); io.mapRequired("pointer", info.is_pointer); } @@ -81,7 +82,7 @@ template <> struct llvm::yaml::MappingTraits { static void mapping(IO& io, cusan::KernelModel& info) { if (!io.outputting()) { - info.kernel = llvm::None; + info.kernel = std::nullopt; } io.mapRequired("name", info.kernel_name); io.mapRequired("args", info.args); diff --git a/lib/pass/AnalysisTransform.cpp b/lib/pass/AnalysisTransform.cpp new file mode 100644 index 0000000..d539dae --- /dev/null +++ b/lib/pass/AnalysisTransform.cpp @@ -0,0 +1,624 @@ +#include "AnalysisTransform.h" + +#include "support/Logger.h" +#include "support/Util.h" + +namespace cusan { +auto get_void_ptr_type(IRBuilder<>& irb) { +#if LLVM_VERSION_MAJOR >= 15 + return irb.getPtrTy(); +#else + return irb.getInt8PtrTy(); +#endif +} + +namespace analysis { +namespace helper { + +bool does_name_match(const std::string& model_kernel_name, llvm::CallBase& cb) { + assert(cb.getFunction() != nullptr && "Callbase requires function."); + const auto stub_name = util::try_demangle_fully(*cb.getFunction()); + const auto searching_name = util::try_demangle_fully(model_kernel_name); + + StringRef searching_without_type{searching_name}; + if (StringRef{stub_name}.contains("lambda")) { + LOG_DEBUG("Detected lambda function in stub name " << stub_name) + // if we got a lambda it has a return type included that we want to shave off + const auto first_space = searching_name.find(' '); + searching_without_type = llvm::StringRef(searching_name).substr(first_space + 1); + } + + LOG_DEBUG("Check stub \"" << stub_name << "\" ends with \"" << searching_name << "\" or \"" << searching_without_type + << "\"") + return helper::ends_with_any_of(stub_name, searching_name, searching_without_type); +} +} // namespace helper + +std::optional CudaKernelInvokeCollector::match(llvm::CallBase& cb, + Function& callee) const { + if (callee.getName() == "cudaLaunchKernel" && helper::does_name_match(model.kernel_name, cb)) { + // && ends_with(stub_name, searching_name) + // errs() << "Func:" << stub_name << " " << searching_name << " == " << (stub_name == searching_name) << "\n"; + // errs() << cb.getFunction()->getName() << " " << model.kernel_name << "\n" << cb << "\n"; + + auto* cu_stream_handle = std::prev(cb.arg_end())->get(); + auto* void_kernel_arg_array = std::prev(cb.arg_end(), 3)->get(); + auto kernel_args = extract_kernel_args_for(void_kernel_arg_array); + + return KernelInvokeData{kernel_args, void_kernel_arg_array, cu_stream_handle}; + } + return std::nullopt; +} + +llvm::SmallVector CudaKernelInvokeCollector::extract_kernel_args_for( + llvm::Value* void_kernel_arg_array) const { + unsigned index = 0; + + llvm::SmallVector real_args; + + for (auto* array_user : void_kernel_arg_array->users()) { + if (auto* gep = dyn_cast(array_user)) { + for (auto* gep_user : gep->users()) { + if (auto* store = dyn_cast(gep_user)) { + if (!(index < model.args.size())) { + LOG_FATAL("In: " << *store->getParent()->getParent()) + LOG_FATAL("Out of bounds for model args: " << index << " vs. 
" << model.args.size()); + assert(false && "Encountered out of bounds access"); + } + if (auto* cast = dyn_cast(store->getValueOperand())) { + real_args.push_back(*cast->operand_values().begin()); + } else { + real_args.push_back(*store->operand_values().begin()); + } + index++; + } + } + } + } + + llvm::SmallVector result = model.args; + for (auto& res : result) { + Value* val = real_args[real_args.size() - 1 - res.arg_pos]; + // because of ABI? clang might convert struct argument to a (byval)pointer + // but the actual cuda argument is just a value. So we double check that it actually allocates a pointer + bool real_ptr = false; + if (auto* as_alloca = dyn_cast(val)) { + real_ptr = res.is_pointer && as_alloca->getAllocatedType()->isPointerTy(); + } + + // not fake pointer from clang so load it before getting subargs + for (auto& sub_arg : res.subargs) { + if (real_ptr) { + sub_arg.does_load = true; + sub_arg.gep_indicies.clear(); + } + sub_arg.value = val; + } + res.value = val; + } + return result; +} +} // namespace analysis +} // namespace cusan + +namespace cusan::transform { + +bool KernelInvokeTransformer::transform(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) const { + using namespace llvm; + return generate_compound_cb(data, irb); +} + +short KernelInvokeTransformer::access_cast(AccessState access, bool is_ptr) { + auto value = static_cast(access); + value <<= 1; + if (is_ptr) { + value |= 1; + } + return value; +} + +llvm::Value* KernelInvokeTransformer::get_cu_stream_ptr(const analysis::CudaKernelInvokeCollector::Data& data, + IRBuilder<>& irb) { + auto* cu_stream = data.cu_stream; + assert(cu_stream != nullptr && "Require cuda stream!"); + auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(cu_stream, get_void_ptr_type(irb)); + return cu_stream_void_ptr; +} + +bool KernelInvokeTransformer::generate_compound_cb(const analysis::CudaKernelInvokeCollector::Data& data, + IRBuilder<>& irb) const { + const bool should_transform = + llvm::count_if(data.args, [&](const auto& elem) { + return llvm::count_if(elem.subargs, [&](const auto& sub_elem) { return sub_elem.is_pointer; }) > 0; + }) > 0; + + uint32_t n_subargs = 0; + for (const auto& arg : data.args) { + n_subargs += arg.subargs.size(); + } + + if (!should_transform) { + return false; + } + + auto target_callback = decls_->cusan_register_access; + + auto* i16_ty = Type::getInt16Ty(irb.getContext()); + auto* i32_ty = Type::getInt32Ty(irb.getContext()); + auto* void_ptr_ty = get_void_ptr_type(irb); + + auto* cu_stream_void_ptr = get_cu_stream_ptr(data, irb); + auto* arg_size = irb.getInt32(n_subargs); + auto* arg_access_array = irb.CreateAlloca(i16_ty, arg_size); + auto* arg_value_array = irb.CreateAlloca(void_ptr_ty, arg_size); + + size_t arg_array_index = 0; + for (const auto& arg : data.args) { + LOG_DEBUG("Handling Arg: " << arg) + for (const auto& sub_arg : arg.subargs) { + LOG_DEBUG(" subarg: " << sub_arg) + const auto access = access_cast(sub_arg.state, sub_arg.is_pointer); + Value* idx = ConstantInt::get(i32_ty, arg_array_index); + Value* acc = ConstantInt::get(i16_ty, access); + auto* gep_acc = irb.CreateGEP(i16_ty, arg_access_array, idx); + irb.CreateStore(acc, gep_acc); + // only if it is a pointer store the actual pointer in the value array + if (sub_arg.is_pointer) { + assert(arg.value.has_value()); + auto* value_ptr = arg.value.value(); + + if (auto* alloca_value = dyn_cast_or_null(value_ptr)) { + auto* subtype = alloca_value->getAllocatedType(); + + if (!sub_arg.gep_indicies.empty()) { + 
llvm::SmallVector values{ + llvm::map_range(sub_arg.gep_indicies, [&irb](auto index) { return (Value*)irb.getInt32(index); })}; + value_ptr = irb.CreateGEP(subtype, value_ptr, values); +#if LLVM_VERSION_MAJOR >= 15 + subtype = void_ptr_ty; +#else + subtype = value_ptr->getType()->getPointerElementType(); +#endif + } + + if (sub_arg.does_load) { + value_ptr = irb.CreateLoad(subtype, value_ptr); + } + } + + auto* voided_ptr = irb.CreatePointerCast(value_ptr, void_ptr_ty); + auto* gep_val_array = irb.CreateGEP(void_ptr_ty, arg_value_array, idx); + irb.CreateStore(voided_ptr, gep_val_array); + arg_array_index += 1; + } + } + } + + Value* args_cusan_register[] = {arg_value_array, arg_access_array, arg_size, cu_stream_void_ptr}; + irb.CreateCall(target_callback.f, args_cusan_register); + return true; +} + +// DeviceSyncInstrumenter + +DeviceSyncInstrumenter::DeviceSyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaDeviceSynchronize", &decls->cusan_sync_device.f); +} +llvm::SmallVector DeviceSyncInstrumenter::map_arguments(IRBuilder<>&, llvm::ArrayRef) { + return {}; +} + +// StreamSyncInstrumenter + +StreamSyncInstrumenter::StreamSyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaStreamSynchronize", &decls->cusan_sync_stream.f); +} +llvm::SmallVector StreamSyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 1); + Value* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {cu_stream_void_ptr}; +} + +// EventSyncInstrumenter + +EventSyncInstrumenter::EventSyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaEventSynchronize", &decls->cusan_sync_event.f); +} +llvm::SmallVector EventSyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 1); + auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {cu_event_void_ptr}; +} + +// EventRecordInstrumenter + +EventRecordInstrumenter::EventRecordInstrumenter(callback::FunctionDecl* decls) { + setup("cudaEventRecord", &decls->cusan_event_record.f); +} +llvm::SmallVector EventRecordInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 2); + auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[1], get_void_ptr_type(irb)); + return {cu_event_void_ptr, cu_stream_void_ptr}; +} + +// EventRecordFlagsInstrumenter + +EventRecordFlagsInstrumenter::EventRecordFlagsInstrumenter(callback::FunctionDecl* decls) { + setup("cudaEventRecordWithFlags", &decls->cusan_event_record.f); +} +llvm::SmallVector EventRecordFlagsInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 3); + auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[1], get_void_ptr_type(irb)); + return {cu_event_void_ptr, cu_stream_void_ptr}; +} + +// CudaMemcpyAsyncInstrumenter + +CudaMemcpyAsyncInstrumenter::CudaMemcpyAsyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemcpyAsync", &decls->cusan_memcpy_async.f); +} +llvm::SmallVector CudaMemcpyAsyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 + assert(args.size() == 5); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* src_ptr = 
irb.CreateBitOrPointerCast(args[1], get_void_ptr_type(irb)); + auto* count = args[2]; + auto* kind = args[3]; + auto* cu_stream = irb.CreateBitOrPointerCast(args[4], get_void_ptr_type(irb)); + return {dst_ptr, src_ptr, count, kind, cu_stream}; +} + +// CudaMemcpyInstrumenter + +CudaMemcpyInstrumenter::CudaMemcpyInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemcpy", &decls->cusan_memcpy.f); +} +llvm::SmallVector CudaMemcpyInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* dst, const void* src, size_t count, cudaMemcpyKind kind + assert(args.size() == 4); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* src_ptr = irb.CreateBitOrPointerCast(args[1], get_void_ptr_type(irb)); + auto* count = args[2]; + auto* kind = args[3]; + return {dst_ptr, src_ptr, count, kind}; +} + +// CudaMemcpy2DInstrumenter + +CudaMemcpy2DInstrumenter::CudaMemcpy2DInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemcpy2D", &decls->cusan_memcpy_2d.f); +} +llvm::SmallVector CudaMemcpy2DInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, cusan_MemcpyKind kind + assert(args.size() == 7); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* dpitch = args[1]; + auto* src_ptr = irb.CreateBitOrPointerCast(args[2], get_void_ptr_type(irb)); + auto* spitch = args[3]; + auto* width = args[4]; + auto* height = args[5]; + auto* kind = args[6]; + return {dst_ptr, dpitch, src_ptr, spitch, width, height, kind}; +} + +// CudaMemcpy2DAsyncInstrumenter + +CudaMemcpy2DAsyncInstrumenter::CudaMemcpy2DAsyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemcpy2DAsync", &decls->cusan_memcpy_2d_async.f); +} +llvm::SmallVector CudaMemcpy2DAsyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, cusan_MemcpyKind kind, + // stream + assert(args.size() == 8); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* dpitch = args[1]; + auto* src_ptr = irb.CreateBitOrPointerCast(args[2], get_void_ptr_type(irb)); + auto* spitch = args[3]; + auto* width = args[4]; + auto* height = args[5]; + auto* kind = args[6]; + auto* cu_stream = irb.CreateBitOrPointerCast(args[7], get_void_ptr_type(irb)); + return {dst_ptr, dpitch, src_ptr, spitch, width, height, kind, cu_stream}; +} + +// CudaMemsetAsyncInstrumenter + +CudaMemsetAsyncInstrumenter::CudaMemsetAsyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemsetAsync", &decls->cusan_memset_async.f); +} +llvm::SmallVector CudaMemsetAsyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) + assert(args.size() == 4); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + // auto* value = args[1]; + auto* count = args[2]; + auto* cu_stream = irb.CreateBitOrPointerCast(args[3], get_void_ptr_type(irb)); + return {dst_ptr, count, cu_stream}; +} + +// CudaMemsetInstrumenter + +CudaMemsetInstrumenter::CudaMemsetInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemset", &decls->cusan_memset.f); +} +llvm::SmallVector CudaMemsetInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* devPtr, int value, size_t count,) + assert(args.size() == 3); + auto* dst_ptr = 
irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + // auto* value = args[1]; + auto* count = args[2]; + return {dst_ptr, count}; +} + +// CudaMemset2dAsyncInstrumenter + +CudaMemset2dAsyncInstrumenter::CudaMemset2dAsyncInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemset2DAsync", &decls->cusan_memset_2d_async.f); +} +llvm::SmallVector CudaMemset2dAsyncInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream = 0 + assert(args.size() == 6); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* pitch = args[1]; + // auto* value = args[2]; + auto* height = args[3]; + auto* width = args[4]; + auto* cu_stream = irb.CreateBitOrPointerCast(args[5], get_void_ptr_type(irb)); + return {dst_ptr, pitch, height, width, cu_stream}; +} + +// CudaMemset2dInstrumenter + +CudaMemset2dInstrumenter::CudaMemset2dInstrumenter(callback::FunctionDecl* decls) { + setup("cudaMemset2D", &decls->cusan_memset_2d.f); +} +llvm::SmallVector CudaMemset2dInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + // void* devPtr, size_t pitch, int value, size_t width, size_t height + assert(args.size() == 5); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* pitch = args[1]; + // auto* value = args[2]; + auto* height = args[3]; + auto* width = args[4]; + ; + return {dst_ptr, pitch, height, width}; +} + +// CudaHostAlloc + +CudaHostAlloc::CudaHostAlloc(callback::FunctionDecl* decls) { + setup("cudaHostAlloc", &decls->cusan_host_alloc.f); +} +llvm::SmallVector CudaHostAlloc::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void** ptr, size_t size, unsigned int flags ) + assert(args.size() == 3); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* size = args[1]; + auto* flags = args[2]; + return {dst_ptr, size, flags}; +} + +// CudaMallocHost + +CudaMallocHost::CudaMallocHost(callback::FunctionDecl* decls) { + setup("cudaMallocHost", &decls->cusan_host_alloc.f); +} +llvm::SmallVector CudaMallocHost::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void** ptr, size_t size) + assert(args.size() == 2); + auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* size = args[1]; + auto* flags = llvm::ConstantInt::get(Type::getInt32Ty(irb.getContext()), 0, false); + return {dst_ptr, size, flags}; +} + +// CudaEventCreateInstrumenter + +CudaEventCreateInstrumenter::CudaEventCreateInstrumenter(callback::FunctionDecl* decls) { + setup("cudaEventCreate", &decls->cusan_event_create.f); +} +llvm::SmallVector CudaEventCreateInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 1); + // auto* cu_event_void_ptr = irb.CreateLoad(get_void_ptr_type(irb), args[0], ""); + auto* cu_event_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {cu_event_void_ptr_ptr}; +} + +// CudaEventCreateWithFlagsInstrumenter + +CudaEventCreateWithFlagsInstrumenter::CudaEventCreateWithFlagsInstrumenter(callback::FunctionDecl* decls) { + setup("cudaEventCreateWithFlags", &decls->cusan_event_create.f); +} +llvm::SmallVector CudaEventCreateWithFlagsInstrumenter::map_arguments(IRBuilder<>& irb, + llvm::ArrayRef args) { + // cudaEvent_t* event, unsigned int flags + assert(args.size() == 2); + auto* cu_event_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return 
{cu_event_void_ptr_ptr}; +} + +// StreamCreateInstrumenter + +StreamCreateInstrumenter::StreamCreateInstrumenter(callback::FunctionDecl* decls) { + setup("cudaStreamCreate", &decls->cusan_stream_create.f); +} +llvm::SmallVector StreamCreateInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 1); + auto* flags = llvm::ConstantInt::get(Type::getInt32Ty(irb.getContext()), 0, false); + auto* cu_stream_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {cu_stream_void_ptr_ptr, flags}; +} + +// StreamCreateWithFlagsInstrumenter + +StreamCreateWithFlagsInstrumenter::StreamCreateWithFlagsInstrumenter(callback::FunctionDecl* decls) { + setup("cudaStreamCreateWithFlags", &decls->cusan_stream_create.f); +} + +llvm::SmallVector StreamCreateWithFlagsInstrumenter::map_arguments(IRBuilder<>& irb, + llvm::ArrayRef args) { + assert(args.size() == 2); + auto* cu_stream_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* flags = args[1]; + return {cu_stream_void_ptr_ptr, flags}; +} + +// StreamCreateWithPriorityInstrumenter + +StreamCreateWithPriorityInstrumenter::StreamCreateWithPriorityInstrumenter(callback::FunctionDecl* decls) { + setup("cudaStreamCreateWithPriority", &decls->cusan_stream_create.f); +} + +llvm::SmallVector StreamCreateWithPriorityInstrumenter::map_arguments(IRBuilder<>& irb, + llvm::ArrayRef args) { + assert(args.size() == 3); + auto* cu_stream_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* flags = args[1]; + return {cu_stream_void_ptr_ptr, flags}; +} + +// StreamWaitEventInstrumenter + +StreamWaitEventInstrumenter::StreamWaitEventInstrumenter(callback::FunctionDecl* decls) { + setup("cudaStreamWaitEvent", &decls->cusan_stream_wait_event.f); +} +llvm::SmallVector StreamWaitEventInstrumenter::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + assert(args.size() == 3); + // auto* cu_stream_void_ptr = irb.CreateLoad(get_void_ptr_type(irb), args[0], ""); + auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[1], get_void_ptr_type(irb)); + return {cu_stream_void_ptr, cu_event_void_ptr, args[2]}; +} + +// CudaHostRegister + +CudaHostRegister::CudaHostRegister(callback::FunctionDecl* decls) { + setup("cudaHostRegister", &decls->cusan_host_register.f); +} +llvm::SmallVector CudaHostRegister::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr) + assert(args.size() == 3); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* size = args[1]; + auto* flags = args[2]; + return {ptr, size, flags}; +} + +// CudaHostUnregister + +CudaHostUnregister::CudaHostUnregister(callback::FunctionDecl* decls) { + setup("cudaHostUnregister", &decls->cusan_host_unregister.f); +} +llvm::SmallVector CudaHostUnregister::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr) + assert(args.size() == 1); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {ptr}; +} + +// CudaHostFree + +CudaHostFree::CudaHostFree(callback::FunctionDecl* decls) { + setup("cudaFreeHost", &decls->cusan_host_free.f); +} +llvm::SmallVector CudaHostFree::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr) + assert(args.size() == 1); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {ptr}; +} + +// CudaMallocManaged + 
+CudaMallocManaged::CudaMallocManaged(callback::FunctionDecl* decls) { + setup("cudaMallocManaged", &decls->cusan_managed_alloc.f); +} +llvm::SmallVector CudaMallocManaged::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr, size_t size, u32 flags) + assert(args.size() == 3); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* size = args[1]; + auto* flags = args[2]; + return {ptr, size, flags}; +} + +// CudaMalloc + +CudaMalloc::CudaMalloc(callback::FunctionDecl* decls) { + setup("cudaMalloc", &decls->cusan_device_alloc.f); +} +llvm::SmallVector CudaMalloc::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr, size_t size) + assert(args.size() == 2); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + auto* size = args[1]; + return {ptr, size}; +} + +// CudaFree + +CudaFree::CudaFree(callback::FunctionDecl* decls) { + setup("cudaFree", &decls->cusan_device_free.f); +} +llvm::SmallVector CudaFree::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* ptr) + assert(args.size() == 1); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {ptr}; +} + +// CudaMallocPitch + +CudaMallocPitch::CudaMallocPitch(callback::FunctionDecl* decls) { + setup("cudaMallocPitch", &decls->cusan_device_alloc.f); +} +llvm::SmallVector CudaMallocPitch::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //(void** devPtr, size_t* pitch, size_t width, size_t height ) + assert(args.size() == 4); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + + //"The function may pad the allocation" + //"*pitch by cudaMallocPitch() is the width in bytes of the allocation" + auto* pitch = irb.CreateLoad(irb.getIntPtrTy(irb.GetInsertBlock()->getModule()->getDataLayout()), args[1]); + // auto* width = args[2]; + auto* height = args[3]; + + auto* real_size = irb.CreateMul(pitch, height); + return {ptr, real_size}; +} + +// CudaStreamQuery + +CudaStreamQuery::CudaStreamQuery(callback::FunctionDecl* decls) { + setup("cudaStreamQuery", &decls->cusan_stream_query.f); +} +llvm::SmallVector CudaStreamQuery::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* stream) + assert(args.size() == 1); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {ptr}; +} +llvm::SmallVector CudaStreamQuery::map_return_value(IRBuilder<>& irb, Value* result) { + (void)irb; + return {result}; +} + +// CudaEventQuery + +CudaEventQuery::CudaEventQuery(callback::FunctionDecl* decls) { + setup("cudaEventQuery", &decls->cusan_event_query.f); +} +llvm::SmallVector CudaEventQuery::map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { + //( void* event) + assert(args.size() == 1); + auto* ptr = irb.CreateBitOrPointerCast(args[0], get_void_ptr_type(irb)); + return {ptr}; +} +llvm::SmallVector CudaEventQuery::map_return_value(IRBuilder<>& irb, Value* result) { + (void)irb; + return {result}; +} + +} // namespace cusan::transform diff --git a/lib/pass/AnalysisTransform.h b/lib/pass/AnalysisTransform.h index a3852e8..97c1bc1 100644 --- a/lib/pass/AnalysisTransform.h +++ b/lib/pass/AnalysisTransform.h @@ -10,11 +10,11 @@ #include "FunctionDecl.h" #include "analysis/KernelAnalysis.h" +#include #include #include #include #include -#include using namespace llvm; namespace cusan { @@ -23,10 +23,23 @@ namespace analysis { using KernelArgInfo = cusan::FunctionArg; +namespace helper { +template +bool ends_with_any_of(const std::string& name, Strings&&... 
searching_names) { + const llvm::StringRef name_ref{name}; +#if LLVM_VERSION_MAJOR > 15 + return (name_ref.ends_with(searching_names) || ...); +#else + return (name_ref.endswith(searching_names) || ...); +#endif +} + +} // namespace helper + struct CudaKernelInvokeCollector { KernelModel& model; struct KernelInvokeData { - llvm::SmallVector args{}; + llvm::SmallVector args; llvm::Value* void_arg_array{nullptr}; llvm::Value* cu_stream{nullptr}; }; @@ -35,62 +48,9 @@ struct CudaKernelInvokeCollector { CudaKernelInvokeCollector(KernelModel& current_stub_model) : model(current_stub_model) { } - llvm::Optional match(llvm::CallBase& cb, Function& callee) const { - if (callee.getName() == "cudaLaunchKernel") { - errs() << "Func:" << callee.getFunction() << "\n"; - auto* cu_stream_handle = std::prev(cb.arg_end())->get(); - auto* void_kernel_arg_array = std::prev(cb.arg_end(), 3)->get(); - // auto* cb_parent_function = cb.getFunction(); - auto kernel_args = extract_kernel_args_for(void_kernel_arg_array); - - return KernelInvokeData{kernel_args, void_kernel_arg_array, cu_stream_handle}; - } - return llvm::NoneType(); - } - - llvm::SmallVector extract_kernel_args_for(llvm::Value* void_kernel_arg_array) const { - unsigned index = 0; - - llvm::SmallVector real_args; - - for (auto* array_user : void_kernel_arg_array->users()) { - if (auto* gep = dyn_cast(array_user)) { - for (auto* gep_user : gep->users()) { - if (auto* store = dyn_cast(gep_user)) { - assert(index < model.args.size()); - if (auto* cast = dyn_cast(store->getValueOperand())) { - real_args.push_back(*cast->operand_values().begin()); - } else { - assert(false); - } - index++; - } - } - } - } + std::optional match(llvm::CallBase& cb, Function& callee) const; - llvm::SmallVector result = model.args; - for (auto& res : result) { - Value* val = real_args[real_args.size() - 1 - res.arg_pos]; - // because of ABI? clang might convert struct argument to a (byval)pointer - // but the actual cuda argument is by value so we double check if the expected type matches the actual type - // and only if then we load it. I think this should handle all cases since the only case it would fail - // is if we do strct* and send that (byval)pointer but that shouldn't be a thing? 
- bool real_ptr = - res.is_pointer && - (dyn_cast(dyn_cast(val->getType())->getPointerElementType()) != nullptr); - - // not fake pointer from clang so load it before getting subargs - for (auto& sub_arg : res.subargs) { - if (real_ptr) { - sub_arg.indices.insert(sub_arg.indices.begin(), -1); - } - sub_arg.value = val; - } - res.value = val; - } - return result; - } + llvm::SmallVector extract_kernel_args_for(llvm::Value* void_kernel_arg_array) const; }; } // namespace analysis @@ -103,94 +63,13 @@ struct KernelInvokeTransformer { KernelInvokeTransformer(callback::FunctionDecl* decls) : decls_(decls) { } - bool transform(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) const { - using namespace llvm; - return generate_compound_cb(data, irb); - } + bool transform(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) const; private: - static short access_cast(AccessState access, bool is_ptr) { - auto value = static_cast(access); - value <<= 1; - if (is_ptr) { - value |= 1; - } - return value; - } + static short access_cast(AccessState access, bool is_ptr); - static llvm::Value* get_cu_stream_ptr(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) { - auto* cu_stream = data.cu_stream; - assert(cu_stream != nullptr && "Require cuda stream!"); - auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(cu_stream, irb.getInt8PtrTy()); - return cu_stream_void_ptr; - } - - bool generate_compound_cb(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) const { - const bool should_transform = - llvm::count_if(data.args, [&](const auto& elem) { - return llvm::count_if(elem.subargs, [&](const auto& sub_elem) { return sub_elem.is_pointer; }) > 0; - }) > 0; - - uint32_t n_subargs = 0; - for (const auto& arg : data.args) { - n_subargs += arg.subargs.size(); - } - - if (!should_transform) { - return false; - } - - auto target_callback = decls_->cusan_register_access; - - auto* i16_ty = Type::getInt16Ty(irb.getContext()); - auto* i32_ty = Type::getInt32Ty(irb.getContext()); - auto* void_ptr_ty = Type::getInt8PtrTy(irb.getContext()); - // auto* void_ptr_ptr_ty = Type::getInt8PtrTy(irb.getContext())->getPointerTo(); - - auto* cu_stream_void_ptr = get_cu_stream_ptr(data, irb); - auto* arg_size = irb.getInt32(n_subargs); - auto* arg_access_array = irb.CreateAlloca(i16_ty, arg_size); - auto* arg_value_array = irb.CreateAlloca(void_ptr_ty, arg_size); - - size_t arg_array_index = 0; - for (const auto& arg : data.args) { - errs() << "Handling Arg: " << arg << "\n"; - for (const auto& sub_arg : arg.subargs) { - errs() << " subarg: " << sub_arg << "\n"; - const auto access = access_cast(sub_arg.state, sub_arg.is_pointer); - Value* idx = ConstantInt::get(i32_ty, arg_array_index); - Value* acc = ConstantInt::get(i16_ty, access); - auto* gep_acc = irb.CreateGEP(i16_ty, arg_access_array, idx); - irb.CreateStore(acc, gep_acc); - // only if it is a pointer store the actual pointer in the value array - if (sub_arg.is_pointer) { - assert(arg.value.hasValue()); - - auto* value_ptr = arg.value.getValue(); - - // TODO: parts of a struct might be null if they are only executed conditionally so we should check the parent - // for null before gep/load - for (auto gep_index : sub_arg.indices) { - auto* subtype = dyn_cast(value_ptr->getType())->getPointerElementType(); - if (gep_index == -1) { - value_ptr = irb.CreateLoad(subtype, value_ptr); - } else { - value_ptr = irb.CreateStructGEP(subtype, value_ptr, gep_index); - } - } - - auto* voided_ptr = 
irb.CreatePointerCast(value_ptr, void_ptr_ty); - auto* gep_val_array = irb.CreateGEP(void_ptr_ty, arg_value_array, idx); - irb.CreateStore(voided_ptr, gep_val_array); - arg_array_index += 1; - } - } - } - - Value* args_cusan_register[] = {arg_value_array, arg_access_array, arg_size, cu_stream_void_ptr}; - irb.CreateCall(target_callback.f, args_cusan_register); - return true; - } + static llvm::Value* get_cu_stream_ptr(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb); + bool generate_compound_cb(const analysis::CudaKernelInvokeCollector::Data& data, IRBuilder<>& irb) const; }; template @@ -213,8 +92,8 @@ class CallInstrumenter { if (auto* cb = dyn_cast(&I)) { if (auto* f = cb->getCalledFunction()) { auto t = collector_.match(*cb, *f); - if (t.hasValue()) { - data_vec_.push_back({t.getValue(), cb}); + if (t.has_value()) { + data_vec_.push_back({t.value(), cb}); } } } @@ -296,323 +175,56 @@ class SimpleInstrumenter { } }; -class DeviceSyncInstrumenter : public SimpleInstrumenter { - public: - DeviceSyncInstrumenter(callback::FunctionDecl* decls) { - setup("cudaDeviceSynchronize", &decls->cusan_sync_device.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>&, llvm::ArrayRef) { - return {}; - } -}; -class StreamSyncInstrumenter : public SimpleInstrumenter { - public: - StreamSyncInstrumenter(callback::FunctionDecl* decls) { - setup("cudaStreamSynchronize", &decls->cusan_sync_stream.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 1); - Value* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {cu_stream_void_ptr}; - } -}; -class EventSyncInstrumenter : public SimpleInstrumenter { - public: - EventSyncInstrumenter(callback::FunctionDecl* decls) { - setup("cudaEventSynchronize", &decls->cusan_sync_event.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 1); - auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {cu_event_void_ptr}; - } -}; -class EventRecordInstrumenter : public SimpleInstrumenter { - public: - EventRecordInstrumenter(callback::FunctionDecl* decls) { - setup("cudaEventRecord", &decls->cusan_event_record.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 2); - auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[1], irb.getInt8PtrTy()); - return {cu_event_void_ptr, cu_stream_void_ptr}; - } -}; -class EventRecordFlagsInstrumenter : public SimpleInstrumenter { - public: - EventRecordFlagsInstrumenter(callback::FunctionDecl* decls) { - setup("cudaEventRecordWithFlags", &decls->cusan_event_record.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 3); - auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[1], irb.getInt8PtrTy()); - return {cu_event_void_ptr, cu_stream_void_ptr}; - } -}; - -class MemcpyAsyncInstrumenter : public SimpleInstrumenter { - public: - MemcpyAsyncInstrumenter(callback::FunctionDecl* decls) { - setup("cudaMemcpyAsync", &decls->cusan_memcpy_async.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - // void* dst, const void* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream = 0 - 
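The kernel launch transformer above packs each sub-argument's access mode into a single short: the AccessState value shifted left by one, with the low bit recording whether the argument is a pointer; access_cast_back in CusanRuntime.cpp further down reverses this. A minimal sketch of the encode/decode pair, using a plain short in place of the project's AccessState enum since its enumerators are defined elsewhere:

// Sketch only: mirrors access_cast in the pass and access_cast_back in the runtime.
short encode_access(short access_state, bool is_pointer) {
  short value = static_cast<short>(access_state << 1);
  if (is_pointer) {
    value |= 1;
  }
  return value;
}

struct DecodedAccess {
  short access_state;
  bool is_pointer;
};

DecodedAccess decode_access(short cb_value) {
  return DecodedAccess{static_cast<short>(cb_value >> 1), (cb_value & 1) != 0};
}

// The instrumented launch site then amounts to a call of the form
//   _cusan_kernel_register(arg_value_array, arg_access_array, n_subargs, raw_stream);
// where arg_access_array[i] holds the encoded mode for the i-th registered sub-argument.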
assert(args.size() == 5); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* src_ptr = irb.CreateBitOrPointerCast(args[1], irb.getInt8PtrTy()); - auto* count = args[2]; - auto* kind = args[3]; - auto* cu_stream = irb.CreateBitOrPointerCast(args[4], irb.getInt8PtrTy()); - return {dst_ptr, src_ptr, count, kind, cu_stream}; - } -}; - -class CudaMemcpyInstrumenter : public SimpleInstrumenter { - public: - CudaMemcpyInstrumenter(callback::FunctionDecl* decls) { - setup("cudaMemcpy", &decls->cusan_memcpy.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - // void* dst, const void* src, size_t count, cudaMemcpyKind kind - assert(args.size() == 4); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* src_ptr = irb.CreateBitOrPointerCast(args[1], irb.getInt8PtrTy()); - auto* count = args[2]; - auto* kind = args[3]; - return {dst_ptr, src_ptr, count, kind}; - } -}; - -class MemsetAsyncInstrumenter : public SimpleInstrumenter { - public: - MemsetAsyncInstrumenter(callback::FunctionDecl* decls) { - setup("cudaMemsetAsync", &decls->cusan_memset_async.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* devPtr, int value, size_t count, cudaStream_t stream = 0 ) - assert(args.size() == 4); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* value = args[1]; - auto* count = args[2]; - auto* cu_stream = irb.CreateBitOrPointerCast(args[3], irb.getInt8PtrTy()); - return {dst_ptr, value, count, cu_stream}; - } -}; -class CudaMemsetInstrumenter : public SimpleInstrumenter { - public: - CudaMemsetInstrumenter(callback::FunctionDecl* decls) { - setup("cudaMemset", &decls->cusan_memset.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* devPtr, int value, size_t count,) - assert(args.size() == 3); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* value = args[1]; - auto* count = args[2]; - return {dst_ptr, value, count}; - } -}; - -class CudaHostAlloc : public SimpleInstrumenter { - public: - CudaHostAlloc(callback::FunctionDecl* decls) { - setup("cudaHostAlloc", &decls->cusan_host_alloc.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void** ptr, size_t size, unsigned int flags ) - assert(args.size() == 3); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* size = args[1]; - auto* flags = args[2]; - return {dst_ptr, size, flags}; - } -}; - -class CudaMallocHost : public SimpleInstrumenter { - public: - CudaMallocHost(callback::FunctionDecl* decls) { - setup("cudaMallocHost", &decls->cusan_host_alloc.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void** ptr, size_t size) - assert(args.size() == 2); - auto* dst_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* size = args[1]; - auto* flags = llvm::ConstantInt::get(Type::getInt32Ty(irb.getContext()), 0, false); - return {dst_ptr, size, flags}; - } -}; - -class EventCreateInstrumenter : public SimpleInstrumenter { - public: - EventCreateInstrumenter(callback::FunctionDecl* decls) { - setup("cudaEventCreate", &decls->cusan_event_create.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 1); - // auto* cu_event_void_ptr = irb.CreateLoad(irb.getInt8PtrTy(), args[0], ""); - auto* cu_event_void_ptr_ptr = 
irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {cu_event_void_ptr_ptr}; - } -}; - -class StreamCreateInstrumenter : public SimpleInstrumenter { - public: - StreamCreateInstrumenter(callback::FunctionDecl* decls) { - setup("cudaStreamCreate", &decls->cusan_stream_create.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 1); - auto* flags = llvm::ConstantInt::get(Type::getInt32Ty(irb.getContext()), 0, false); - auto* cu_stream_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {cu_stream_void_ptr_ptr, flags}; - } -}; - -class StreamCreateWithFlagsInstrumenter : public SimpleInstrumenter { - public: - StreamCreateWithFlagsInstrumenter(callback::FunctionDecl* decls) { - setup("cudaStreamCreateWithFlags", &decls->cusan_stream_create.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 2); - auto* cu_stream_void_ptr_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* flags = args[1]; - return {cu_stream_void_ptr_ptr, flags}; - } -}; - -class StreamWaitEventInstrumenter : public SimpleInstrumenter { - public: - StreamWaitEventInstrumenter(callback::FunctionDecl* decls) { - setup("cudaStreamWaitEvent", &decls->cusan_stream_wait_event.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - assert(args.size() == 3); - // auto* cu_stream_void_ptr = irb.CreateLoad(irb.getInt8PtrTy(), args[0], ""); - auto* cu_stream_void_ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* cu_event_void_ptr = irb.CreateBitOrPointerCast(args[1], irb.getInt8PtrTy()); - return {cu_stream_void_ptr, cu_event_void_ptr, args[2]}; - } -}; - -class CudaHostRegister : public SimpleInstrumenter { - public: - CudaHostRegister(callback::FunctionDecl* decls) { - setup("cudaHostRegister", &decls->cusan_host_register.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr) - assert(args.size() == 3); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* size = args[1]; - auto* flags = args[2]; - return {ptr, size, flags}; - } -}; - -class CudaHostUnregister : public SimpleInstrumenter { - public: - CudaHostUnregister(callback::FunctionDecl* decls) { - setup("cudaHostUnregister", &decls->cusan_host_unregister.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr) - assert(args.size() == 1); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {ptr}; - } -}; - -class CudaHostFree : public SimpleInstrumenter { - public: - CudaHostFree(callback::FunctionDecl* decls) { - setup("cudaFreeHost", &decls->cusan_host_free.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr) - assert(args.size() == 1); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {ptr}; - } -}; - -class CudaMallocManaged : public SimpleInstrumenter { - public: - CudaMallocManaged(callback::FunctionDecl* decls) { - setup("cudaMallocManaged", &decls->cusan_managed_alloc.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr, size_t size, u32 flags) - assert(args.size() == 3); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* size = args[1]; - auto* flags = args[2]; - return {ptr, size, flags}; - } -}; +#ifndef 
BasicInstrumenterDecl +#define BasicInstrumenterDecl(name) \ + class name : public SimpleInstrumenter { \ + public: \ + name(callback::FunctionDecl* decls); \ + static llvm::SmallVector map_arguments(IRBuilder<>&, llvm::ArrayRef); \ + }; +#endif -class CudaMalloc : public SimpleInstrumenter { - public: - CudaMalloc(callback::FunctionDecl* decls) { - setup("cudaMalloc", &decls->cusan_device_alloc.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr, size_t size) - assert(args.size() == 2); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - auto* size = args[1]; - return {ptr, size}; - } -}; - -class CudaFree : public SimpleInstrumenter { - public: - CudaFree(callback::FunctionDecl* decls) { - setup("cudaFree", &decls->cusan_device_free.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* ptr) - assert(args.size() == 1); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {ptr}; - } -}; +BasicInstrumenterDecl(DeviceSyncInstrumenter); +BasicInstrumenterDecl(StreamSyncInstrumenter); +BasicInstrumenterDecl(EventSyncInstrumenter); +BasicInstrumenterDecl(EventRecordInstrumenter); +BasicInstrumenterDecl(EventRecordFlagsInstrumenter); +BasicInstrumenterDecl(CudaMemcpyAsyncInstrumenter); +BasicInstrumenterDecl(CudaMemcpyInstrumenter); +BasicInstrumenterDecl(CudaMemcpy2DInstrumenter); +BasicInstrumenterDecl(CudaMemcpy2DAsyncInstrumenter); +BasicInstrumenterDecl(CudaMemsetAsyncInstrumenter); +BasicInstrumenterDecl(CudaMemsetInstrumenter); +BasicInstrumenterDecl(CudaMemset2dAsyncInstrumenter); +BasicInstrumenterDecl(CudaMemset2dInstrumenter); +BasicInstrumenterDecl(CudaHostAlloc); +BasicInstrumenterDecl(CudaMallocHost); +BasicInstrumenterDecl(CudaEventCreateInstrumenter); +BasicInstrumenterDecl(CudaEventCreateWithFlagsInstrumenter); +BasicInstrumenterDecl(StreamCreateInstrumenter); +BasicInstrumenterDecl(StreamCreateWithFlagsInstrumenter); +BasicInstrumenterDecl(StreamCreateWithPriorityInstrumenter); +BasicInstrumenterDecl(StreamWaitEventInstrumenter); +BasicInstrumenterDecl(CudaHostRegister); +BasicInstrumenterDecl(CudaHostUnregister); +BasicInstrumenterDecl(CudaHostFree); +BasicInstrumenterDecl(CudaMallocManaged); +BasicInstrumenterDecl(CudaMalloc); +BasicInstrumenterDecl(CudaFree); +BasicInstrumenterDecl(CudaMallocPitch); class CudaStreamQuery : public SimpleInstrumenter { public: - CudaStreamQuery(callback::FunctionDecl* decls) { - setup("cudaStreamQuery", &decls->cusan_stream_query.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* stream) - assert(args.size() == 1); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {ptr}; - } - static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result) { - (void)irb; - return {result}; - } + CudaStreamQuery(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); }; class CudaEventQuery : public SimpleInstrumenter { public: - CudaEventQuery(callback::FunctionDecl* decls) { - setup("cudaEventQuery", &decls->cusan_event_query.f); - } - static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args) { - //( void* event) - assert(args.size() == 1); - auto* ptr = irb.CreateBitOrPointerCast(args[0], irb.getInt8PtrTy()); - return {ptr}; - } - static llvm::SmallVector 
map_return_value(IRBuilder<>& irb, Value* result) { - (void)irb; - return {result}; - } + CudaEventQuery(callback::FunctionDecl* decls); + static llvm::SmallVector map_arguments(IRBuilder<>& irb, llvm::ArrayRef args); + static llvm::SmallVector map_return_value(IRBuilder<>& irb, Value* result); }; } // namespace transform diff --git a/lib/pass/CMakeLists.txt b/lib/pass/CMakeLists.txt index 4dc8e48..f9dce7f 100644 --- a/lib/pass/CMakeLists.txt +++ b/lib/pass/CMakeLists.txt @@ -1,5 +1,7 @@ set(PASS_SOURCES CusanPass.cpp + AnalysisTransform.cpp + FunctionDecl.cpp ) cusan_llvm_module(cusan_TransformPass @@ -20,6 +22,10 @@ add_library(cusan::TransformPass ALIAS cusan_TransformPass) target_compile_features(cusan_TransformPass PUBLIC cxx_std_17) +target_compile_definitions(cusan_TransformPass PRIVATE + LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} + CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_PASS}) + target_include_directories(cusan_TransformPass ${warning_guard} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/lib/ diff --git a/lib/pass/CusanPass.cpp b/lib/pass/CusanPass.cpp index 5c78e6a..00bc33c 100644 --- a/lib/pass/CusanPass.cpp +++ b/lib/pass/CusanPass.cpp @@ -4,9 +4,10 @@ // (See accompanying file LICENSE) // SPDX-License-Identifier: BSD-3-Clause +#include "CusanPass.h" + #include "AnalysisTransform.h" #include "CommandLine.h" -#include "CusanPass.h" #include "FunctionDecl.h" #include "analysis/KernelAnalysis.h" #include "support/CudaUtil.h" @@ -57,7 +58,7 @@ class LegacyCusanPass : public llvm::ModulePass { public: static char ID; // NOLINT - LegacyCusanPass() : ModulePass(ID){}; + LegacyCusanPass() : ModulePass(ID) {}; bool runOnModule(llvm::Module& module) override; @@ -131,34 +132,45 @@ bool CusanPass::runOnFunc(llvm::Function& function) { } bool modified = false; - transform::DeviceSyncInstrumenter(&cusan_decls_).instrument(function); - transform::StreamSyncInstrumenter(&cusan_decls_).instrument(function); - transform::EventSyncInstrumenter(&cusan_decls_).instrument(function); - transform::EventRecordInstrumenter(&cusan_decls_).instrument(function); - transform::EventRecordFlagsInstrumenter(&cusan_decls_).instrument(function); - transform::EventCreateInstrumenter(&cusan_decls_).instrument(function); - transform::StreamCreateInstrumenter(&cusan_decls_).instrument(function); - transform::MemsetAsyncInstrumenter(&cusan_decls_).instrument(function); - transform::MemcpyAsyncInstrumenter(&cusan_decls_).instrument(function); - transform::CudaMemsetInstrumenter(&cusan_decls_).instrument(function); - transform::CudaMemcpyInstrumenter(&cusan_decls_).instrument(function); - transform::StreamWaitEventInstrumenter(&cusan_decls_).instrument(function); - transform::CudaMallocHost(&cusan_decls_).instrument(function); - transform::CudaHostAlloc(&cusan_decls_).instrument(function); - transform::CudaHostFree(&cusan_decls_).instrument(function); - transform::CudaHostRegister(&cusan_decls_).instrument(function); - transform::CudaHostUnregister(&cusan_decls_).instrument(function); - transform::CudaMallocManaged(&cusan_decls_).instrument(function); - transform::CudaMalloc(&cusan_decls_).instrument(function); - transform::CudaFree(&cusan_decls_).instrument(function); - transform::CudaStreamQuery(&cusan_decls_).instrument(function); - transform::CudaEventQuery(&cusan_decls_).instrument(function); - transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function); + + modified |= transform::DeviceSyncInstrumenter(&cusan_decls_).instrument(function); + modified |= 
transform::StreamSyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::EventSyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::EventRecordInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::EventRecordFlagsInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaEventCreateInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaEventCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::StreamCreateInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemset2dAsyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemsetAsyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemcpyAsyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemset2dInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemsetInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemcpyInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemcpy2DInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMemcpy2DAsyncInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::StreamWaitEventInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMallocHost(&cusan_decls_).instrument(function); + modified |= transform::CudaHostAlloc(&cusan_decls_).instrument(function); + modified |= transform::CudaHostFree(&cusan_decls_).instrument(function); + modified |= transform::CudaHostRegister(&cusan_decls_).instrument(function); + modified |= transform::CudaHostUnregister(&cusan_decls_).instrument(function); + modified |= transform::CudaMallocManaged(&cusan_decls_).instrument(function); + modified |= transform::CudaMalloc(&cusan_decls_).instrument(function); + modified |= transform::CudaFree(&cusan_decls_).instrument(function); + modified |= transform::CudaStreamQuery(&cusan_decls_).instrument(function); + modified |= transform::CudaEventQuery(&cusan_decls_).instrument(function); + modified |= transform::StreamCreateWithFlagsInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::StreamCreateWithPriorityInstrumenter(&cusan_decls_).instrument(function); + modified |= transform::CudaMallocPitch(&cusan_decls_).instrument(function); + auto data_for_host = host::kernel_model_for_stub(&function, this->kernel_models_); if (data_for_host) { - transform::CallInstrumenter(analysis::CudaKernelInvokeCollector{data_for_host.value()}, - transform::KernelInvokeTransformer{&cusan_decls_}, function) - .instrument(); + LOG_FATAL("Found kernel data for " << util::try_demangle_fully(function) << ": " + << data_for_host.value().kernel_name) + modified |= transform::CallInstrumenter(analysis::CudaKernelInvokeCollector{data_for_host.value()}, + transform::KernelInvokeTransformer{&cusan_decls_}, function) + .instrument(); } return modified; } diff --git a/lib/pass/FunctionDecl.cpp b/lib/pass/FunctionDecl.cpp new file mode 100644 index 0000000..f58c828 --- /dev/null +++ b/lib/pass/FunctionDecl.cpp @@ -0,0 +1,129 @@ +#include "FunctionDecl.h" + +namespace cusan::callback { + +void FunctionDecl::initialize(llvm::Module& module) { + using namespace llvm; + auto& c = module.getContext(); + + const auto add_optimizer_attributes = [&](auto& arg) { + arg.addAttr(Attribute::NoCapture); + arg.addAttr(Attribute::ReadOnly); + }; + + const auto 
make_function = [&](auto& f_struct, auto f_types) { + auto func_type = f_types.empty() ? FunctionType::get(Type::getVoidTy(c), false) + : FunctionType::get(Type::getVoidTy(c), f_types, false); + auto func_callee = module.getOrInsertFunction(f_struct.name, func_type); + f_struct.f = func_callee; + f_struct.arg_types = std::move(f_types); + if (auto f = dyn_cast(f_struct.f.getCallee())) { + f->setLinkage(GlobalValue::ExternalLinkage); + if (f->arg_size() == 0) { + return; + } + auto& first_param = *(f->arg_begin()); + if (first_param.getType()->isPointerTy()) { + add_optimizer_attributes(first_param); + } + } + }; + + auto* void_ptr = Type::getInt8Ty(c)->getPointerTo(); + auto* int16_ptr = Type::getInt16Ty(c)->getPointerTo(); + + using ArgTypes = decltype(CusanFunction::arg_types); + + ArgTypes arg_types_cusan_register = {PointerType::get(void_ptr, 0), int16_ptr, Type::getInt32Ty(c), void_ptr}; + make_function(cusan_register_access, arg_types_cusan_register); + + ArgTypes arg_types_sync_device = {}; + make_function(cusan_sync_device, arg_types_sync_device); + + ArgTypes arg_types_sync_stream = {void_ptr}; + make_function(cusan_sync_stream, arg_types_sync_stream); + + ArgTypes arg_types_sync_event = {void_ptr}; + make_function(cusan_sync_event, arg_types_sync_event); + ArgTypes arg_types_event_record = {void_ptr, void_ptr}; + make_function(cusan_event_record, arg_types_event_record); + + ArgTypes arg_types_event_create = {void_ptr}; + make_function(cusan_event_create, arg_types_event_create); + + ArgTypes arg_types_stream_create = {void_ptr, Type::getInt32Ty(c)}; + make_function(cusan_stream_create, arg_types_stream_create); + + auto* size_t_ty = module.getDataLayout().getIntPtrType(c); + + // void* devPtr, size_t count, RawStream* stream + ArgTypes arg_types_memset_async = {void_ptr, size_t_ty, void_ptr}; + make_function(cusan_memset_async, arg_types_memset_async); + + // void* dst, const void* src + ArgTypes arg_types_memcpy_async = {void_ptr, void_ptr, + // size_t count, MemcpyKind kind, RawStream stream + size_t_ty, Type::getInt32Ty(c), void_ptr}; + make_function(cusan_memcpy_async, arg_types_memcpy_async); + + // void* devPtr, size_t count + ArgTypes arg_types_memset = {void_ptr, size_t_ty}; + make_function(cusan_memset, arg_types_memset); + + // void* dst, const void* src + ArgTypes arg_types_memcpy = {void_ptr, void_ptr, + // size_t count, MemcpyKind kind + size_t_ty, Type::getInt32Ty(c)}; + make_function(cusan_memcpy, arg_types_memcpy); + + ArgTypes arg_types_stream_wait_event = {void_ptr, void_ptr, Type::getInt32Ty(c)}; + make_function(cusan_stream_wait_event, arg_types_stream_wait_event); + + ArgTypes arg_types_host_alloc = {void_ptr, size_t_ty, Type::getInt32Ty(c)}; + make_function(cusan_host_alloc, arg_types_host_alloc); + + ArgTypes arg_types_host_register = {void_ptr, size_t_ty, Type::getInt32Ty(c)}; + make_function(cusan_host_register, arg_types_host_register); + + ArgTypes arg_types_host_unregister = {void_ptr}; + make_function(cusan_host_unregister, arg_types_host_unregister); + + ArgTypes arg_types_host_free = {void_ptr}; + make_function(cusan_host_free, arg_types_host_free); + + ArgTypes arg_types_managed_alloc = {void_ptr, size_t_ty, Type::getInt32Ty(c)}; + make_function(cusan_managed_alloc, arg_types_managed_alloc); + + ArgTypes arg_device_alloc = {void_ptr, size_t_ty}; + make_function(cusan_device_alloc, arg_device_alloc); + + ArgTypes arg_device_free = {void_ptr}; + make_function(cusan_device_free, arg_device_free); + + // RawStream stream, u32 return_errType + ArgTypes 
arg_stream_query = {void_ptr, Type::getInt32Ty(c)}; + make_function(cusan_stream_query, arg_stream_query); + + // Event stream, u32 return_errType + ArgTypes arg_event_query = {void_ptr, Type::getInt32Ty(c)}; + make_function(cusan_event_query, arg_event_query); + + // void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, cusan_MemcpyKind kind + ArgTypes arg_types_memcpy_2d = {void_ptr, size_t_ty, void_ptr, size_t_ty, size_t_ty, size_t_ty, Type::getInt32Ty(c)}; + make_function(cusan_memcpy_2d, arg_types_memcpy_2d); + + // void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, cusan_MemcpyKind kind + ArgTypes arg_types_memcpy_2d_async = {void_ptr, size_t_ty, void_ptr, size_t_ty, size_t_ty, + size_t_ty, Type::getInt32Ty(c), void_ptr}; + make_function(cusan_memcpy_2d_async, arg_types_memcpy_2d_async); + + // void* devPtr, size_t pitch, size_t width, size_t height, cudaStream_t stream = 0 + ArgTypes arg_types_memset_2d_async = {void_ptr, size_t_ty, size_t_ty, size_t_ty, void_ptr}; + make_function(cusan_memset_2d_async, arg_types_memset_2d_async); + + // void* devPtr, size_t pitch, size_t width, size_t height + ArgTypes arg_types_2d_memset = {void_ptr, size_t_ty, size_t_ty, size_t_ty}; + make_function(cusan_memset_2d, arg_types_2d_memset); +} + +} // namespace cusan::callback diff --git a/lib/pass/FunctionDecl.h b/lib/pass/FunctionDecl.h index 2a32ef7..d40e80a 100644 --- a/lib/pass/FunctionDecl.h +++ b/lib/pass/FunctionDecl.h @@ -10,29 +10,30 @@ #include #include -using namespace llvm; namespace cusan { namespace callback { struct CusanFunction { const std::string name; - FunctionCallee f{nullptr}; - SmallVector arg_types{}; + llvm::FunctionCallee f{nullptr}; + llvm::SmallVector arg_types{}; }; struct FunctionDecl { CusanFunction cusan_register_access{"_cusan_kernel_register"}; - CusanFunction cusan_event_record{"_cusan_event_record"}; - CusanFunction cusan_sync_device{"_cusan_sync_device"}; CusanFunction cusan_sync_stream{"_cusan_sync_stream"}; CusanFunction cusan_sync_event{"_cusan_sync_event"}; CusanFunction cusan_event_create{"_cusan_create_event"}; CusanFunction cusan_stream_create{"_cusan_create_stream"}; + CusanFunction cusan_memset_2d_async{"_cusan_memset_2d_async"}; CusanFunction cusan_memset_async{"_cusan_memset_async"}; CusanFunction cusan_memcpy_async{"_cusan_memcpy_async"}; CusanFunction cusan_memset{"_cusan_memset"}; + CusanFunction cusan_memset_2d{"_cusan_memset_2d"}; CusanFunction cusan_memcpy{"_cusan_memcpy"}; + CusanFunction cusan_memcpy_2d{"_cusan_memcpy_2d"}; + CusanFunction cusan_memcpy_2d_async{"_cusan_memcpy_2d_async"}; CusanFunction cusan_stream_wait_event{"_cusan_stream_wait_event"}; CusanFunction cusan_host_alloc{"_cusan_host_alloc"}; CusanFunction cusan_managed_alloc{"_cusan_managed_alloc"}; @@ -44,109 +45,7 @@ struct FunctionDecl { CusanFunction cusan_stream_query{"_cusan_stream_query"}; CusanFunction cusan_event_query{"_cusan_event_query"}; - void initialize(Module& m) { - using namespace llvm; - auto& c = m.getContext(); - - const auto add_optimizer_attributes = [&](auto& arg) { - arg.addAttr(Attribute::NoCapture); - arg.addAttr(Attribute::ReadOnly); - }; - - const auto make_function = [&](auto& f_struct, auto f_types) { - auto func_type = f_types.empty() ? 
FunctionType::get(Type::getVoidTy(c), false) - : FunctionType::get(Type::getVoidTy(c), f_types, false); - auto func_callee = m.getOrInsertFunction(f_struct.name, func_type); - f_struct.f = func_callee; - f_struct.arg_types = std::move(f_types); - if (auto f = dyn_cast(f_struct.f.getCallee())) { - f->setLinkage(GlobalValue::ExternalLinkage); - if (f->arg_size() == 0) { - return; - } - auto& first_param = *(f->arg_begin()); - if (first_param.getType()->isPointerTy()) { - add_optimizer_attributes(first_param); - } - } - }; - using ArgTypes = decltype(CusanFunction::arg_types); - // TODO address space? - ArgTypes arg_types_cusan_register = {PointerType::get(Type::getInt8PtrTy(c), 0), Type::getInt16PtrTy(c), - Type::getInt32Ty(c), Type::getInt8PtrTy(c)}; - make_function(cusan_register_access, arg_types_cusan_register); - - ArgTypes arg_types_sync_device = {}; - make_function(cusan_sync_device, arg_types_sync_device); - - ArgTypes arg_types_sync_stream = {Type::getInt8PtrTy(c)}; - make_function(cusan_sync_stream, arg_types_sync_stream); - - ArgTypes arg_types_sync_event = {Type::getInt8PtrTy(c)}; - make_function(cusan_sync_event, arg_types_sync_event); - ArgTypes arg_types_event_record = {Type::getInt8PtrTy(c), Type::getInt8PtrTy(c)}; - make_function(cusan_event_record, arg_types_event_record); - - ArgTypes arg_types_event_create = {Type::getInt8PtrTy(c)}; - make_function(cusan_event_create, arg_types_event_create); - - ArgTypes arg_types_stream_create = {Type::getInt8PtrTy(c), Type::getInt32Ty(c)}; - make_function(cusan_stream_create, arg_types_stream_create); - - auto size_t_ty = m.getDataLayout().getIntPtrType(c); - - // void* devPtr, int value, size_t count, RawStream* stream - ArgTypes arg_types_memset_async = {Type::getInt8PtrTy(c), Type::getInt32Ty(c), size_t_ty, Type::getInt8PtrTy(c)}; - make_function(cusan_memset_async, arg_types_memset_async); - - // void* dst, const void* src - ArgTypes arg_types_memcpy_async = {Type::getInt8PtrTy(c), Type::getInt8PtrTy(c), - // size_t count, MemcpyKind kind, RawStream stream - size_t_ty, Type::getInt32Ty(c), Type::getInt8PtrTy(c)}; - make_function(cusan_memcpy_async, arg_types_memcpy_async); - - // void* devPtr, int value, size_t count - ArgTypes arg_types_memset = {Type::getInt8PtrTy(c), Type::getInt32Ty(c), size_t_ty}; - make_function(cusan_memset, arg_types_memset); - - // void* dst, const void* src - ArgTypes arg_types_memcpy = {Type::getInt8PtrTy(c), Type::getInt8PtrTy(c), - // size_t count, MemcpyKind kind - size_t_ty, Type::getInt32Ty(c)}; - make_function(cusan_memcpy, arg_types_memcpy); - - ArgTypes arg_types_stream_wait_event = {Type::getInt8PtrTy(c), Type::getInt8PtrTy(c), Type::getInt32Ty(c)}; - make_function(cusan_stream_wait_event, arg_types_stream_wait_event); - - ArgTypes arg_types_host_alloc = {Type::getInt8PtrTy(c), size_t_ty, Type::getInt32Ty(c)}; - make_function(cusan_host_alloc, arg_types_host_alloc); - - ArgTypes arg_types_host_register = {Type::getInt8PtrTy(c), size_t_ty, Type::getInt32Ty(c)}; - make_function(cusan_host_register, arg_types_host_register); - - ArgTypes arg_types_host_unregister = {Type::getInt8PtrTy(c)}; - make_function(cusan_host_unregister, arg_types_host_unregister); - - ArgTypes arg_types_host_free = {Type::getInt8PtrTy(c)}; - make_function(cusan_host_free, arg_types_host_free); - - ArgTypes arg_types_managed_alloc = {Type::getInt8PtrTy(c), size_t_ty, Type::getInt32Ty(c)}; - make_function(cusan_managed_alloc, arg_types_managed_alloc); - - ArgTypes arg_device_alloc = {Type::getInt8PtrTy(c), size_t_ty}; - 
make_function(cusan_device_alloc, arg_device_alloc); - - ArgTypes arg_device_free = {Type::getInt8PtrTy(c)}; - make_function(cusan_device_free, arg_device_free); - - // RawStream stream, u32 return_errType - ArgTypes arg_stream_query = {Type::getInt8PtrTy(c), Type::getInt32Ty(c)}; - make_function(cusan_stream_query, arg_stream_query); - - // Event stream, u32 return_errType - ArgTypes arg_event_query = {Type::getInt8PtrTy(c), Type::getInt32Ty(c)}; - make_function(cusan_event_query, arg_event_query); - } + void initialize(llvm::Module& m); }; } // namespace callback diff --git a/lib/runtime/CMakeLists.txt b/lib/runtime/CMakeLists.txt index e997d65..651f6f0 100644 --- a/lib/runtime/CMakeLists.txt +++ b/lib/runtime/CMakeLists.txt @@ -14,7 +14,9 @@ target_compile_features(cusan_mpi_interceptor PUBLIC cxx_std_17) target_compile_definitions( cusan_mpi_interceptor PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} + LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} $<$:CUSAN_FIBERPOOL=1> + $<$:CUSAN_TYPEART=1> $<$:CUSAN_SOFTCOUNTER> ) @@ -72,7 +74,12 @@ add_library(cusan::Runtime ALIAS cusan_Runtime) target_compile_features(cusan_Runtime PUBLIC cxx_std_17) # latter for cudaSpecific.cpp: -target_link_libraries(cusan_Runtime PRIVATE typeart::Runtime CUDA::cudart) +target_link_libraries(cusan_Runtime PRIVATE CUDA::cudart) + +if(CUSAN_TYPEART) + target_link_libraries(cusan_Runtime PRIVATE typeart::Runtime) +endif() +target_link_libraries(cusan_Runtime PRIVATE LLVMSupport) if(CUSAN_FIBERPOOL) target_link_libraries(cusan_Runtime PUBLIC cusan::fiberpool) @@ -98,8 +105,10 @@ cusan_target_define_file_basename(cusan_Runtime) target_compile_definitions( cusan_Runtime PRIVATE CUSAN_LOG_LEVEL=${CUSAN_LOG_LEVEL_RT} + LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR} $<$:CUSAN_SYNC_DETAIL_LEVEL=1> $<$:CUSAN_FIBERPOOL=1> + $<$:CUSAN_TYPEART=1> $<$:CUSAN_SOFTCOUNTER> ) diff --git a/lib/runtime/CusanRuntime.cpp b/lib/runtime/CusanRuntime.cpp index 9236821..a859af4 100644 --- a/lib/runtime/CusanRuntime.cpp +++ b/lib/runtime/CusanRuntime.cpp @@ -6,12 +6,16 @@ #include "CusanRuntime.h" // clang-format off +#ifdef CUSAN_TYPEART #include "RuntimeInterface.h" +#endif #include "analysis/KernelModel.h" #include "support/Logger.h" -#include "TSan_External.h" #include "StatsCounter.h" +#if CUSAN_SOFTCOUNTER #include "support/Table.h" +#endif +#include "TSanInterface.h" // clang-format on #include #include @@ -43,6 +47,17 @@ struct AllocationInfo { size_t size; bool is_pinned = false; bool is_managed = false; + bool on_device = false; + + static constexpr AllocationInfo Device(size_t size) { + return AllocationInfo{size, false, false, true}; + } + static constexpr AllocationInfo Pinned(size_t size) { + return AllocationInfo{size, true, false, false}; + } + static constexpr AllocationInfo Managed(size_t size) { + return AllocationInfo{size, false, true, false}; + } }; struct PtrAttribute { @@ -52,7 +67,7 @@ struct PtrAttribute { PtrAttribute access_cast_back(short cb_value) { const short access = (cb_value >> 1); - const bool ptr = cb_value & 1; + const bool ptr = (cb_value & 1) != 0; return PtrAttribute{AccessState{access}, ptr}; } @@ -62,6 +77,7 @@ struct PointerAccess { }; class Runtime { + // NOTE: assumed to be a ordered map so we can iterate in ascending order std::map allocations_; std::map streams_; std::map events_; @@ -70,9 +86,9 @@ class Runtime { bool init_ = false; public: + static constexpr Stream kDefaultStream = Stream(); Recorder stats_recorder; - public: static Runtime& get() { static Runtime run_t; if (!run_t.init_) { @@ -83,7 
+99,7 @@ class Runtime { run_t.curr_fiber_ = run_t.cpu_fiber_; // default '0' cuda stream - { run_t.register_stream(Stream()); } + { run_t.register_stream(kDefaultStream); } run_t.init_ = true; } @@ -94,6 +110,10 @@ class Runtime { void operator=(const Runtime&) = delete; + [[nodiscard]] const std::map& get_allocations() const { + return allocations_; + } + void happens_before() { LOG_TRACE("[cusan] HappensBefore of curr fiber") TsanHappensBefore(curr_fiber_); @@ -105,11 +125,11 @@ class Runtime { // if we were on the default stream we should also post sync // meaning that all work submitted afterwards from the cpu should also run after the default kernels are done // TODO: double check with blocking - auto search_result = streams_.find(Stream()); + auto search_result = streams_.find(Runtime::kDefaultStream); assert(search_result != streams_.end() && "Tried using stream that wasn't created prior"); if (curr_fiber_ == search_result->second) { LOG_TRACE("[cusan] syncing all other blocking GPU streams to run after since it's the default stream") - for (auto& [s, sync_var] : streams_) { + for (const auto& [s, sync_var] : streams_) { if (s.isBlocking && !s.isDefaultStream()) { LOG_TRACE("[cusan] happens before " << s.handle) TsanHappensBefore(sync_var); @@ -125,11 +145,14 @@ class Runtime { } void register_stream(Stream stream) { - auto search_result = streams_.find(stream); + static uint32_t n_streams = 0; + auto search_result = streams_.find(stream); assert(search_result == streams_.end() && "Registered stream twice"); TsanFiber fiber = TsanCreateFiber(0); stats_recorder.inc_TsanCreateFiber(); - TsanSetFiberName(fiber, "cuda_stream"); + char name[32]; + snprintf(name, 32, "cuda_stream %u", n_streams++); + TsanSetFiberName(fiber, name); streams_.insert({stream, fiber}); } @@ -156,7 +179,7 @@ class Runtime { void happens_after_all_streams(bool onlyBlockingStreams = false) { LOG_TRACE("[cusan] happens_after_all_streams but only blocking ones: " << onlyBlockingStreams) - for (auto [stream, fiber] : streams_) { + for (const auto& [stream, fiber] : streams_) { if (!onlyBlockingStreams || stream.isBlocking) { LOG_TRACE("[cusan] happens after " << stream.handle) TsanHappensAfter(fiber); @@ -222,6 +245,11 @@ class Runtime { #define cusan_stat_handle(name) table.put(Row::make(#name, stats_recorder.get_##name())); #if CUSAN_SOFTCOUNTER Table table{"Cusan runtime statistics"}; +#ifdef CUSAN_FIBERPOOL + table.put(Row::make("Fiberpool", 1)); +#else + table.put(Row::make("Fiberpool", 0)); +#endif CUSAN_CUDA_EVENT_LIST #include "TsanEvents.inc" table.put(Row::make("TsanMemoryReadSize[KB]", stats_recorder.stats_r.getAvg() / 1024.0)); @@ -247,8 +275,55 @@ cusan_MemcpyKind infer_memcpy_direction(const void* target, const void* from); using namespace cusan::runtime; +namespace helper { +#ifndef CUSAN_TYPEART +inline std::optional find_memory_alloc_size(const Runtime& runtime, const void* ptr) { + const auto& allocs = runtime.get_allocations(); + + // if there exists any allocation + if (allocs.size() > 0) { + // find the first allocation whose base is greater than or equal to what we search for + const auto subsequent_alloc = allocs.lower_bound(ptr); + + // if it's equal we got our match + if (subsequent_alloc != allocs.end() && subsequent_alloc->first == ptr) { + return subsequent_alloc->second.size; + } + // else if there exists a previous allocation + else if (subsequent_alloc != allocs.begin()) { + // it is the only one that might include our pointer + // since all allocations are non-overlapping and the start of the allocation needs to be smaller than our ptr 
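When TypeART is disabled, the runtime answers size queries from its own allocation map, relying on std::map keeping base pointers ordered as the comments above describe. A condensed standalone sketch of the same lookup, with a plain pointer-to-size map standing in for the real map of AllocationInfo entries:

#include <cstddef>
#include <map>
#include <optional>

// Simplified stand-in for Runtime::get_allocations(): allocation base -> size in bytes.
using AllocMap = std::map<const void*, std::size_t>;

std::optional<std::size_t> alloc_size_for(const AllocMap& allocs, const void* ptr) {
  if (allocs.empty()) {
    return {};
  }
  // First allocation whose base address is not below ptr.
  const auto subsequent = allocs.lower_bound(ptr);
  if (subsequent != allocs.end() && subsequent->first == ptr) {
    return subsequent->second;  // ptr is itself an allocation base
  }
  if (subsequent == allocs.begin()) {
    return {};  // every allocation starts above ptr, so none can contain it
  }
  // Allocations do not overlap, so only the preceding one can contain ptr.
  const auto& prev = *std::prev(subsequent);
  if (static_cast<const char*>(prev.first) + prev.second >= static_cast<const char*>(ptr)) {
    return prev.second;
  }
  return {};
}

// Example: with allocations {0x1000: 256 bytes, 0x2000: 128 bytes}, querying 0x1040
// returns 256 (it lies inside the first block), while querying 0x3000 returns std::nullopt.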
+ const auto& alloc = *std::prev(subsequent_alloc); + assert(alloc.first <= ptr); + // still got to verify were inside though + if (((const char*)alloc.first + alloc.second.size) >= ptr) { + return alloc.second.size; + } + } + } + return {}; +} +#else +inline std::optional find_memory_alloc_size(const Runtime&, const void* ptr) { + size_t alloc_size{0}; + int alloc_id{0}; + auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size); + if (query_status != TYPEART_OK) { + LOG_TRACE(" [cusan] Querying allocation length failed on " << ptr << ". Code: " << int(query_status)) + return {}; + } + const auto bytes_for_type = typeart_get_type_size(alloc_id); + const auto total_bytes = bytes_for_type * alloc_size; + LOG_TRACE(" [cusan] Querying allocation length of " << ptr << ". Code: " << int(query_status) << " with size" + << total_bytes) + return total_bytes; +} +#endif +} // namespace helper + void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream stream) { LOG_TRACE("[cusan]Kernel Register with " << n << " Args and on stream:" << stream) + auto& runtime = Runtime::get(); llvm::SmallVector sizes; for (int i = 0; i < n; ++i) { @@ -258,24 +333,17 @@ void _cusan_kernel_register(void** kernel_args, short* modes, int n, RawStream s continue; } - size_t alloc_size{0}; - int alloc_id{0}; - auto* ptr = kernel_args[i]; - auto query_status = typeart_get_type(ptr, &alloc_id, &alloc_size); - if (query_status != TYPEART_OK) { - LOG_TRACE(" [cusan] Querying allocation length failed on " << ptr << ". Code: " << int(query_status)) + const auto* ptr = kernel_args[i]; + const auto size_in_bytes = helper::find_memory_alloc_size(runtime, ptr); + if (!size_in_bytes) { + LOG_TRACE(" [cusan] Querying allocation length failed on " << ptr); sizes.push_back(0); continue; } - const auto bytes_for_type = typeart_get_type_size(alloc_id); - const auto total_bytes = bytes_for_type * alloc_size; - LOG_TRACE(" [cusan] Querying allocation length of " << ptr << ". Code: " << int(query_status) << " with size " - << total_bytes) - sizes.push_back(total_bytes); + sizes.push_back(size_in_bytes.value()); } - auto& runtime = Runtime::get(); runtime.stats_recorder.inc_kernel_register_calls(); runtime.switch_to_stream(Stream(stream)); for (int i = 0; i < n; ++i) { @@ -351,164 +419,7 @@ void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags) { runtime.register_stream(Stream(*stream, !(bool)(flags & cusan_StreamFlagsNonBlocking))); } -void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) { - // NOTE: at least for cuda non async memcpy is beheaving like on the default stream - // https://forums.developer.nvidia.com/t/is-cudamemcpyasync-cudastreamsynchronize-on-default-stream-equal-to-cudamemcpy-non-async/108853/5 - LOG_TRACE("[cusan]Memcpy " << count << " bytes from:" << from << " to:" << target) - - if (kind == cusan_MemcpyDefault) { - kind = infer_memcpy_direction(target, from); - } - - auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_memcpy_calls(); - if (CUSAN_SYNC_DETAIL_LEVEL == 0) { - LOG_TRACE("[cusan] DefaultStream+Blocking") - // In this mode: Memcpy always blocks, no detailed view w.r.t. 
memory direction - runtime.switch_to_stream(Stream()); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - runtime.happens_after_stream(Stream()); - } else if (kind == cusan_MemcpyDeviceToDevice) { - // 4. For transfers from device memory to device memory, no host-side synchronization is performed. - LOG_TRACE("[cusan] DefaultStream") - runtime.switch_to_stream(Stream()); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.happens_before(); - runtime.switch_to_cpu(); - } else if (kind == cusan_MemcpyDeviceToHost) { - // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy - // has completed. - LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Stream()); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - runtime.happens_after_stream(Stream()); - } else if (kind == cusan_MemcpyHostToDevice) { - // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is - // initiated. - - auto* alloc_info = runtime.get_allocation_info(from); - // if we couldn't find alloc info we just assume the worst and don't sync - if (alloc_info && !alloc_info->is_pinned) { - runtime.happens_after_stream(Stream()); - LOG_TRACE("[cusan] DefaultStream+Blocking") - } else { - LOG_TRACE("[cusan] DefaultStream") - } - // The function will return once the pageable buffer has been copied to the staging memory for DMA transfer to - // device memory - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - runtime.switch_to_stream(Stream()); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - runtime.happens_after_stream(Stream()); - } else if (kind == cusan_MemcpyHostToHost) { - // 5. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the - // host. - LOG_TRACE("[cusan] DefaultStream+Blocking") - runtime.switch_to_stream(Stream()); - runtime.happens_before(); - runtime.switch_to_cpu(); - runtime.happens_after_stream(Stream()); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - } else { - assert(false && "Should be unreachable"); - } -} - -void _cusan_memset(void* target, int, size_t count) { - // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host - // memory. 
- LOG_TRACE("[cusan]Memset " << count << " bytes to:" << target) - auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_memset_calls(); - runtime.switch_to_stream(Stream()); - LOG_TRACE("[cusan] " << "Write to " << target << " with size: " << count) - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - - auto* alloc_info = runtime.get_allocation_info(target); - // if we couldn't find alloc info we just assume the worst and don't sync - if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) { - LOG_TRACE("[cusan] " << "Memset is blocking") - runtime.happens_after_stream(Stream()); - } else { - LOG_TRACE("[cusan] " << "Memset is not blocking") - if (!alloc_info) { - LOG_DEBUG("[cusan] Failed to get alloc info " << target); - } else if (!alloc_info->is_pinned && !alloc_info->is_managed) { - LOG_TRACE("[cusan] Pinned:" << alloc_info->is_pinned << " Managed:" << alloc_info->is_managed) - } - } - - // r.happens_after_stream(Stream()); -} - -void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) { - LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target) - auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_memcpy_async_calls(); - if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) { - // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the - // host. - LOG_TRACE("[cusan] Blocking") - runtime.switch_to_stream(Stream(stream)); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - runtime.happens_after_stream(Stream(stream)); - } else { - // 1. For transfers between device memory and pageable host memory, the function *might* be synchronous with respect - // to host. - // 2. If pageable memory must first be staged to pinned memory, the driver *may* synchronize with the stream and - // stage the copy into pinned memory. - // 4. For all other transfers, the function should be fully asynchronous. - LOG_TRACE("[cusan] not Blocking") - runtime.switch_to_stream(Stream(stream)); - TsanMemoryReadPC(from, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryRead(); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); - } -} - -void _cusan_memset_async(void* target, int, size_t count, RawStream stream) { - // The Async versions are always asynchronous with respect to the host. 
- LOG_TRACE("[cusan]MemsetAsync" << count << " bytes to:" << target) - auto& runtime = Runtime::get(); - runtime.stats_recorder.inc_memset_async_calls(); - runtime.switch_to_stream(Stream(stream)); - TsanMemoryWritePC(target, count, __builtin_return_address(0)); - runtime.stats_recorder.inc_TsanMemoryWrite(); - runtime.happens_before(); - runtime.switch_to_cpu(); -} - -void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags) { +void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int) { LOG_TRACE("[cusan]StreamWaitEvent stream:" << stream << " on event:" << event) auto& runtime = Runtime::get(); runtime.stats_recorder.inc_stream_wait_event_calls(); @@ -540,7 +451,7 @@ void _cusan_host_register(void* ptr, size_t size, unsigned int) { LOG_TRACE("[cusan]host register " << ptr << " with size:" << size); auto& runtime = Runtime::get(); runtime.stats_recorder.inc_host_register_calls(); - runtime.insert_allocation(ptr, AllocationInfo{size, true, false}); + runtime.insert_allocation(ptr, AllocationInfo::Pinned(size)); } void _cusan_host_unregister(void* ptr) { LOG_TRACE("[cusan]host unregister " << ptr); @@ -554,7 +465,7 @@ void _cusan_managed_alloc(void** ptr, size_t size, unsigned int) { auto& runtime = Runtime::get(); runtime.stats_recorder.inc_managed_alloc_calls(); runtime.happens_after_all_streams(); - runtime.insert_allocation(*ptr, AllocationInfo{size, false, true}); + runtime.insert_allocation(*ptr, AllocationInfo::Managed(size)); } void _cusan_device_alloc(void** ptr, size_t size) { @@ -563,6 +474,8 @@ void _cusan_device_alloc(void** ptr, size_t size) { LOG_TRACE("[cusan]Device alloc " << *ptr << " with size " << size << " -> implicit device sync") auto& runtime = Runtime::get(); runtime.stats_recorder.inc_device_alloc_calls(); + + runtime.insert_allocation(*ptr, AllocationInfo::Device(size)); // runtime.switch_to_stream(Stream()); // runtime.switch_to_cpu(); } @@ -597,3 +510,199 @@ void _cusan_event_query(Event event, unsigned int err) { runtime.sync_event(event); } } + +void _cusan_memset_async_impl(void* target, size_t count, RawStream stream) { + // The Async versions are always asynchronous with respect to the host. + auto& runtime = Runtime::get(); + runtime.stats_recorder.inc_memset_async_calls(); + runtime.switch_to_stream(Stream(stream)); + TsanMemoryWritePC(target, count, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); +} +void _cusan_memset_impl(void* target, size_t count) { + // The cudaMemset functions are asynchronous with respect to the host except when the target memory is pinned host + // memory. 
+ auto& runtime = Runtime::get(); + runtime.stats_recorder.inc_memset_calls(); + runtime.switch_to_stream(Runtime::kDefaultStream); + LOG_TRACE("[cusan] " << "Write to " << target << " with size: " << count) + TsanMemoryWritePC(target, count, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + + auto* alloc_info = runtime.get_allocation_info(target); + // if we couldn't find alloc info we just assume the worst and don't sync + if ((alloc_info && (alloc_info->is_pinned || alloc_info->is_managed)) || CUSAN_SYNC_DETAIL_LEVEL == 0) { + LOG_TRACE("[cusan] " << "Memset is blocking") + runtime.happens_after_stream(Runtime::kDefaultStream); + } else { + LOG_TRACE("[cusan] " << "Memset is not blocking") + if (!alloc_info) { + LOG_DEBUG("[cusan] Failed to get alloc info " << target); + } else if (!alloc_info->is_pinned && !alloc_info->is_managed) { + LOG_TRACE("[cusan] Pinned:" << alloc_info->is_pinned << " Managed:" << alloc_info->is_managed) + } + } + + // r.happens_after_stream(Runtime::default_stream)); +} + +void _cusan_memset_2d(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind) { + _cusan_memset_impl(target, pitch * height); +} +void _cusan_memset_2d_async(void* target, size_t pitch, size_t, size_t height, cusan_MemcpyKind, RawStream stream) { + _cusan_memset_async_impl(target, pitch * height, stream); +} + +void _cusan_memset(void* target, size_t count) { + LOG_TRACE("[cusan]Memset " << count << " bytes to:" << target) + _cusan_memset_impl(target, count); +} + +void _cusan_memset_async(void* target, size_t count, RawStream stream) { + LOG_TRACE("[cusan]MemsetAsync" << count << " bytes to:" << target) + _cusan_memset_async_impl(target, count, stream); +} + +void _cusan_memcpy_async_impl(void* target, size_t write_size, const void* from, size_t read_size, + cusan_MemcpyKind kind, RawStream stream) { + auto& runtime = Runtime::get(); + runtime.stats_recorder.inc_memcpy_async_calls(); + if (kind == cusan_MemcpyHostToHost && CUSAN_SYNC_DETAIL_LEVEL == 1) { + // 2. For transfers from any host memory to any host memory, the function is fully synchronous with respect to the + // host. + LOG_TRACE("[cusan] Blocking") + runtime.switch_to_stream(Stream(stream)); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + runtime.happens_after_stream(Stream(stream)); + } else { + // 1. For transfers between device memory and pageable host memory, the function *might* be synchronous with respect + // to host. + // 2. If pageable memory must first be staged to pinned memory, the driver *may* synchronize with the stream and + // stage the copy into pinned memory. + // 4. For all other transfers, the function should be fully asynchronous. 
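The non-blocking branch below covers the classic asynchronous-copy mistake: by rule 4, cudaMemcpyAsync into pinned host memory returns immediately, so the destination is only safe to read once the stream has been synchronized. An illustrative host program, a sketch rather than a file from the test suite:

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t count = 1 << 20;
  char* host_buf = nullptr;
  char* dev_buf = nullptr;
  cudaStream_t stream;
  cudaMallocHost(reinterpret_cast<void**>(&host_buf), count);  // pinned host memory
  cudaMalloc(reinterpret_cast<void**>(&dev_buf), count);
  cudaStreamCreate(&stream);

  cudaMemsetAsync(dev_buf, 1, count, stream);
  cudaMemcpyAsync(host_buf, dev_buf, count, cudaMemcpyDeviceToHost, stream);
  // Reading host_buf before this point is the race the non-blocking branch of
  // _cusan_memcpy_async_impl is meant to surface.
  cudaStreamSynchronize(stream);
  std::printf("%d\n", host_buf[0]);

  cudaStreamDestroy(stream);
  cudaFree(dev_buf);
  cudaFreeHost(host_buf);
  return 0;
}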
+ LOG_TRACE("[cusan] not Blocking") + runtime.switch_to_stream(Stream(stream)); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + } +} + +void _cusan_memcpy_impl(void* target, size_t write_size, const void* from, size_t read_size, cusan_MemcpyKind kind) { + // TODO verify that the memcpy2d beheaviour is actually the same as normal memcpy + + if (kind == cusan_MemcpyDefault) { + kind = infer_memcpy_direction(target, from); + } + + auto& runtime = Runtime::get(); + runtime.stats_recorder.inc_memcpy_calls(); + if (CUSAN_SYNC_DETAIL_LEVEL == 0) { + LOG_TRACE("[cusan] DefaultStream+Blocking") + // In this mode: Memcpy always blocks, no detailed view w.r.t. memory direction + runtime.switch_to_stream(Runtime::kDefaultStream); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + runtime.happens_after_stream(Runtime::kDefaultStream); + } else if (kind == cusan_MemcpyDeviceToDevice) { + // 4. For transfers from device memory to device memory, no host-side synchronization is performed. + LOG_TRACE("[cusan] DefaultStream") + runtime.switch_to_stream(Runtime::kDefaultStream); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.happens_before(); + runtime.switch_to_cpu(); + } else if (kind == cusan_MemcpyDeviceToHost) { + // 3. For transfers from device to either pageable or pinned host memory, the function returns only once the copy + // has completed. + LOG_TRACE("[cusan] DefaultStream+Blocking") + runtime.switch_to_stream(Runtime::kDefaultStream); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + runtime.happens_after_stream(Runtime::kDefaultStream); + } else if (kind == cusan_MemcpyHostToDevice) { + // 1. For transfers from pageable host memory to device memory, a stream sync is performed before the copy is + // initiated. + + auto* alloc_info = runtime.get_allocation_info(from); + // if we couldn't find alloc info we just assume the worst and don't sync + if (alloc_info && !alloc_info->is_pinned) { + runtime.happens_after_stream(Runtime::kDefaultStream); + LOG_TRACE("[cusan] DefaultStream+Blocking") + } else { + LOG_TRACE("[cusan] DefaultStream") + } + // The function will return once the pageable buffer has been copied to the staging memory for DMA transfer to + // device memory + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + runtime.switch_to_stream(Runtime::kDefaultStream); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + runtime.happens_before(); + runtime.switch_to_cpu(); + runtime.happens_after_stream(Runtime::kDefaultStream); + } else if (kind == cusan_MemcpyHostToHost) { + // 5. 
For transfers from any host memory to any host memory, the function is fully synchronous with respect to the + // host. + LOG_TRACE("[cusan] DefaultStream+Blocking") + runtime.switch_to_stream(Runtime::kDefaultStream); + runtime.happens_before(); + runtime.switch_to_cpu(); + runtime.happens_after_stream(Runtime::kDefaultStream); + TsanMemoryReadPC(from, read_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryRead(); + TsanMemoryWritePC(target, write_size, __builtin_return_address(0)); + runtime.stats_recorder.inc_TsanMemoryWrite(); + } else { + assert(false && "Should be unreachable"); + } +} + +void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, + cusan_MemcpyKind kind, RawStream stream) { + LOG_TRACE("[cusan]MemcpyAsync" << width * height << " bytes to:" << target) + + size_t read_size = spitch * height; + size_t write_size = dpitch * height; + _cusan_memcpy_async_impl(target, write_size, from, read_size, kind, stream); +} + +void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream) { + LOG_TRACE("[cusan]MemcpyAsync" << count << " bytes to:" << target) + _cusan_memcpy_async_impl(target, count, from, count, kind, stream); +} + +void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, + cusan_MemcpyKind kind) { + LOG_TRACE("[cusan]Memcpy2d " << width * height << " from:" << from << " to:" << target); + size_t read_size = spitch * height; + size_t write_size = dpitch * height; + _cusan_memcpy_impl(target, write_size, from, read_size, kind); +} + +void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind kind) { + LOG_TRACE("[cusan]Memcpy " << count << " from:" << from << " to:" << target); + _cusan_memcpy_impl(target, count, from, count, kind); +} diff --git a/lib/runtime/CusanRuntime.h b/lib/runtime/CusanRuntime.h index a51238f..8dcaef2 100644 --- a/lib/runtime/CusanRuntime.h +++ b/lib/runtime/CusanRuntime.h @@ -50,9 +50,16 @@ void _cusan_stream_event(Event event); void _cusan_create_event(RawStream* event); void _cusan_create_stream(RawStream* stream, cusan_StreamCreateFlags flags); void _cusan_memcpy_async(void* target, const void* from, size_t count, cusan_MemcpyKind kind, RawStream stream); -void _cusan_memset_async(void* target, int, size_t count, RawStream stream); +void _cusan_memset_async(void* target, size_t count, RawStream stream); void _cusan_memcpy(void* target, const void* from, size_t count, cusan_MemcpyKind); -void _cusan_memset(void* target, int, size_t count); +void _cusan_memcpy_2d(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, + cusan_MemcpyKind); +void _cusan_memcpy_2d_async(void* target, size_t dpitch, const void* from, size_t spitch, size_t width, size_t height, + cusan_MemcpyKind, RawStream stream); +void _cusan_memset_2d(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind); +void _cusan_memset_2d_async(void* target, size_t pitch, size_t width, size_t height, cusan_MemcpyKind, + RawStream stream); +void _cusan_memset(void* target, size_t count); void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); void _cusan_stream_wait_event(RawStream stream, Event event, unsigned int flags); void _cusan_host_alloc(void** ptr, size_t size, unsigned int flags); diff --git a/lib/runtime/MPIInterception.cpp b/lib/runtime/MPIInterception.cpp index 4606efc..a69b1d8 100644 
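As a usage-level illustration of the new 2-D wrappers above (buffer names and extents below are made up for the example), the recorded access sizes are derived from the pitches rather than from the row width, which over-approximates the bytes actually touched:

    #include <cuda_runtime.h>
    #include <cstdlib>

    int main() {
      const size_t width  = 256 * sizeof(float);  // bytes actually copied per row
      const size_t height = 128;                  // number of rows

      float* dev    = nullptr;
      size_t dpitch = 0;
      cudaMallocPitch((void**)&dev, &dpitch, width, height);

      const size_t spitch = width;  // host rows are tightly packed
      float* host = static_cast<float*>(malloc(spitch * height));

      // The _cusan_memcpy_2d hook treats this call as reading spitch * height
      // bytes from `host` and writing dpitch * height bytes to `dev`, even
      // though only `width` bytes per row are transferred.
      cudaMemcpy2D(dev, dpitch, host, spitch, width, height, cudaMemcpyHostToDevice);

      free(host);
      cudaFree(dev);
      return 0;
    }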
--- a/lib/runtime/MPIInterception.cpp +++ b/lib/runtime/MPIInterception.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include "StatsCounter.h" -#include "TSan_External.h" +#include "TSanInterface.h" #include "support/Table.h" #include diff --git a/lib/runtime/TSanInterface.h b/lib/runtime/TSanInterface.h new file mode 100644 index 0000000..88d0f48 --- /dev/null +++ b/lib/runtime/TSanInterface.h @@ -0,0 +1,7 @@ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wformat" +#pragma GCC diagnostic ignored "-Wmacro-redefined" + +#include "TSan_External.h" + +#pragma GCC diagnostic pop \ No newline at end of file diff --git a/lib/runtime/TSan_External.h b/lib/runtime/TSan_External.h index f56bd9d..e817047 100644 --- a/lib/runtime/TSan_External.h +++ b/lib/runtime/TSan_External.h @@ -22,7 +22,7 @@ typedef unsigned short a16; typedef unsigned int a32; typedef unsigned long long a64; -#include "cstdio" +#include #ifdef CUSAN_FIBERPOOL #include "fiberpool.h" @@ -94,12 +94,12 @@ void __attribute__((weak)) AnnotateRWLockCreate(const char* file, int line, cons void __attribute__((weak)) AnnotateRWLockDestroy(const char* file, int line, const volatile void* cv) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockAcquired(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } -void __attribute__((weak)) -AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, unsigned long long is_w) { +void __attribute__((weak)) AnnotateRWLockReleased(const char* file, int line, const volatile void* cv, + unsigned long long is_w) { FALLBACK_PRINT(__func__); } diff --git a/lib/support/Util.h b/lib/support/Util.h index c9c1c45..01ee4f7 100644 --- a/lib/support/Util.h +++ b/lib/support/Util.h @@ -7,9 +7,12 @@ #ifndef CUSAN_UTIL_H #define CUSAN_UTIL_H +#include "llvm/Config/llvm-config.h" #include "llvm/Demangle/Demangle.h" +#include "llvm/IR/InstrTypes.h" #include +#include namespace cusan::util { @@ -26,8 +29,12 @@ bool starts_with_any_of(const std::string& lhs, Strings&&... 
rhs) { template inline std::string demangle(String&& s) { - std::string name = std::string{s}; - auto demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); + const std::string name = std::string{s}; +#if LLVM_VERSION_MAJOR >= 15 + auto demangle = llvm::itaniumDemangle(name.data(), false); +#else + auto* demangle = llvm::itaniumDemangle(name.data(), nullptr, nullptr, nullptr); +#endif if (demangle && !std::string(demangle).empty()) { return {demangle}; } @@ -36,13 +43,36 @@ inline std::string demangle(String&& s) { template inline std::string try_demangle(const T& site) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v, llvm::Function>) { return demangle(site.getName()); } else { return demangle(site); } } +template +inline std::string demangle_fully(String&& s) { + const std::string name = std::string{s}; +#if LLVM_VERSION_MAJOR >= 15 + const auto demangle = llvm::demangle(name.data()); +#else + const auto demangle = llvm::demangle(name.data()); +#endif + if (!demangle.empty()) { + return demangle; + } + return name; +} + +template +inline std::string try_demangle_fully(const T& site) { + if constexpr (std::is_same_v, llvm::Function>) { + return demangle_fully(site.getName()); + } else { + return demangle_fully(site); + } +} + } // namespace cusan::util #endif // CUSAN_UTIL_H diff --git a/scripts/CMakeLists.txt b/scripts/CMakeLists.txt index 08458da..d64b8ba 100644 --- a/scripts/CMakeLists.txt +++ b/scripts/CMakeLists.txt @@ -2,13 +2,18 @@ function(configure_cusan_script input output) cmake_parse_arguments( ARG "" "" "INSTALL_MODE;COMPILER;WITH_FILTER;APPLY_MODE" ${ARGN} ) - set(TYPEART_SAN_FLAGS "") + + set(CUSAN_BINARY_DIR ${cusan_BINARY_DIR}) set(TYPEART_PROJECT_DIR ${typeart_SOURCE_DIR}) set(TYPEART_BIN_DIR ${typeart_BINARY_DIR}) set(TYPEART_SCRIPT_DIR ${TYPEART_PROJECT_DIR}/scripts) if(ARG_INSTALL_MODE) + set(CUSAN_RT_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) + set(CUSAN_INCLUDE_DIRS "-I${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}") + set(CUSAN_RELOCATABLE 1) + set(TYPEART_INCLUDE_DIRS "-I${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}" ) @@ -18,18 +23,21 @@ function(configure_cusan_script input output) ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR} ) set(TYPEART_BINARY_DIR -I${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}) - set(TYPEART_RELOCATABLE 1) else() + set(CUSAN_INCLUDE_DIRS "-I${PROJECT_SOURCE_DIR}/lib/runtime") + set(CUSAN_RELOCATABLE 0) + set(TYPEART_INCLUDE_DIRS "-I${TYPEART_PROJECT_DIR}/lib/typelib -I${TYPEART_PROJECT_DIR}/lib/runtime -I${TYPEART_PROJECT_DIR}/lib/passes/typegen" ) - if(LIBRARY_OUTPUT_PATH) - set(TYPEART_MPI_INTERCEPT_DIR ${LIBRARY_OUTPUT_PATH}) + if(LIBRARY_OUTPUT_PATH) set(TYPEART_RT_DIR ${LIBRARY_OUTPUT_PATH}) set(TYPEART_PASS_DIR ${LIBRARY_OUTPUT_PATH}) set(TYPEART_ANALYSIS_PASS_DIR ${LIBRARY_OUTPUT_PATH}) + + set(CUSAN_RT_DIR ${LIBRARY_OUTPUT_PATH}) + set(CUSAN_PASS_DIR ${LIBRARY_OUTPUT_PATH}) else() - set(TYPEART_MPI_INTERCEPT_DIR ${TYPEART_BIN_DIR}/lib/mpi_interceptor) set(TYPEART_RT_DIR ${TYPEART_BIN_DIR}/lib/runtime) set(TYPEART_PASS_DIR ${TYPEART_BIN_DIR}/lib/passes) set(TYPEART_ANALYSIS_PASS_DIR ${TYPEART_BIN_DIR}/analysis) @@ -40,63 +48,50 @@ function(configure_cusan_script input output) if(EXECUTABLE_OUTPUT_PATH) set(TYPEART_BINARY_DIR ${EXECUTABLE_OUTPUT_PATH}) + set(CUSAN_BINARY_DIR ${EXECUTABLE_OUTPUT_PATH}) else() set(TYPEART_BINARY_DIR ${TYPEART_BIN_DIR}) - set(CUSAN_BINARY_DIR ${CMAKE_BINARY_DIR}) + set(CUSAN_BINARY_DIR ${CUSAN_BINARY_DIR}) 
endif() - - set(TYPEART_RELOCATABLE 0) endif() if(ARG_COMPILER) - set(TYPEART_COMPILER ${ARG_COMPILER}) + set(CUSAN_COMPILER ${ARG_COMPILER}) endif() - set(TYPEART_OPT "${TYPEART_OPT_EXEC}") + set(CUSAN_OPT "${CUSAN_OPT_EXEC}") if(${LLVM_VERSION_MAJOR} VERSION_GREATER_EQUAL "13") - set(TYPEART_OPT "${TYPEART_OPT} -enable-new-pm=0") + if(CUSAN_TYPEART) + set(CUSAN_OPT "${CUSAN_OPT} -enable-new-pm=0") + else() + set(CUSAN_NEW_PM_REQUIRED 1) + endif() endif() - set(TYPEART_LLC "${TYPEART_LLC_EXEC}") - if(ARG_APPLY_MODE) set(TYPEART_RUN_SCRIPT 0) + set(CUSAN_RUN_SCRIPT 0) else() set(TYPEART_RUN_SCRIPT 1) + set(CUSAN_RUN_SCRIPT 1) endif() if(ARG_WITH_FILTER) set(TYPEART_CALLFILTER "--typeart-filter") endif() - #if(TYPEART_TSAN) - typeart_target_tsan_flags(TYPEART_SAN_FLAGS) - #endif() - - if(TYPEART_ASAN) - typeart_target_asan_flags(asan_flags) - set(TYPEART_SAN_FLAGS ${TYPEART_SAN_FLAGS} ${asan_flags}) - endif() - - if(TYPEART_UBSAN) - typeart_target_ubsan_flags(ubsan_flags) - set(TYPEART_SAN_FLAGS ${TYPEART_SAN_FLAGS} ${ubsan_flags}) - endif() - - list(JOIN TYPEART_SAN_FLAGS " " TYPEART_SAN_FLAGS) - - set(CUSAN_INCLUDE_DIRS "-I${PROJECT_SOURCE_DIR}/lib/runtime") + set(CUSAN_SAN_FLAG "-fsanitize=thread") - if(ARG_APPLY_MODE) - set(CUSAN_RUN_SCRIPT 0) + if(CUSAN_TYPEART) + set(CUSAN_WITH_TYPEART 1) else() - set(CUSAN_RUN_SCRIPT 1) + set(CUSAN_WITH_TYPEART 0) endif() cusan_target_generate_file(${input} ${output}) endfunction() -function(typeart_find_mpi_vendor_helper symbol ret_value) +function(cusan_find_mpi_vendor_helper symbol ret_value) find_package(MPI) if(NOT MPI_FOUND) @@ -121,7 +116,7 @@ function(typeart_find_mpi_vendor_helper symbol ret_value) check_c_source_compiles("${MPI_TEST_CODE}" ${ret_value}) endfunction() -function(set_typeart_mpi_compiler) +function(set_cusan_mpi_compiler) # Note on mpich & Intel: Intel may also define \"MPICH_NAME\" # in mpi.h, so check MPICH last list(APPEND l_vendor OPEN_MPI INTEL_MPI MPICH) @@ -130,21 +125,21 @@ function(set_typeart_mpi_compiler) list(APPEND l_env_cxx OMPI_CXX I_MPI_CXX MPICH_CXX) foreach(vendor symbol env_c env_cxx IN ZIP_LISTS l_vendor l_symbol l_env_c l_env_cxx) - typeart_find_mpi_vendor_helper(${symbol} TYPEART_HAVE_${vendor}) - if(TYPEART_HAVE_${vendor}) + cusan_find_mpi_vendor_helper(${symbol} CUSAN_HAVE_${vendor}) + if(CUSAN_HAVE_${vendor}) if(MPI_C_FOUND) - set(TYPEART_MPICC - "env ${env_c}=${TYPEART_CLANG_EXEC} ${MPI_C_COMPILER}" - CACHE STRING "TypeART MPICC compiler command for scripts" + set(CUSAN_MPICC + "env ${env_c}=${CUSAN_CLANG_EXEC} ${MPI_C_COMPILER}" + CACHE STRING "CuSan MPICC compiler command for scripts" ) - mark_as_advanced(TYPEART_MPICC) + mark_as_advanced(CUSAN_MPICC) endif() if(MPI_CXX_FOUND) - set(TYPEART_MPICXX - "env ${env_cxx}=${TYPEART_CLANGCXX_EXEC} ${MPI_CXX_COMPILER}" - CACHE STRING "TypeART MPICXX compiler command for scripts" + set(CUSAN_MPICXX + "env ${env_cxx}=${CUSAN_CLANGCXX_EXEC} ${MPI_CXX_COMPILER}" + CACHE STRING "CuSan MPICXX compiler command for scripts" ) - mark_as_advanced(TYPEART_MPICXX) + mark_as_advanced(CUSAN_MPICXX) endif() break() endif() @@ -152,65 +147,88 @@ function(set_typeart_mpi_compiler) endfunction() if(MPI_FOUND) - set_typeart_mpi_compiler() + set_cusan_mpi_compiler() +endif() + +set(CUSAN_WRAPPER cusan-wrapper.in) + +if(NOT CUSAN_TYPEART) + add_library(typeart_runtime_mock SHARED IMPORTED) + add_library(typeart::Runtime ALIAS typeart_runtime_mock) + set_target_properties( + typeart_runtime_mock + PROPERTIES + OUTPUT_NAME "${PROJECT_NAME}RuntimePlaceholder" + IMPORTED_LOCATION 
"${PROJECT_SOURCE_DIR}/lib/runtime" + ) + add_library(typeart_transform_mock SHARED IMPORTED) + add_library(typeart::TransformPass ALIAS typeart_transform_mock) + set_target_properties( + typeart_transform_mock + PROPERTIES + OUTPUT_NAME "${PROJECT_NAME}TransformPassPlaceholder" + IMPORTED_LOCATION "${PROJECT_SOURCE_DIR}/lib/runtime" + ) endif() find_package(MPI) if(MPI_C_FOUND) configure_cusan_script( - cusan-wrapper.in cusan-mpicc${CMAKE_DEBUG_POSTFIX} + ${CUSAN_WRAPPER} cusan-mpicc${CMAKE_DEBUG_POSTFIX} INSTALL_MODE ON WITH_FILTER ON - COMPILER "${TYPEART_MPICC}" + COMPILER "${CUSAN_MPICC}" ) configure_cusan_script( - cusan-wrapper.in cusan-mpicc-test + ${CUSAN_WRAPPER} cusan-mpicc-test WITH_FILTER ON - COMPILER "${TYPEART_MPICC}" + COMPILER "${CUSAN_MPICC}" ) endif() if(MPI_CXX_FOUND) configure_cusan_script( - cusan-wrapper.in cusan-mpic++${CMAKE_DEBUG_POSTFIX} + ${CUSAN_WRAPPER} cusan-mpic++${CMAKE_DEBUG_POSTFIX} INSTALL_MODE ON WITH_FILTER ON - COMPILER "${TYPEART_MPICXX}" + COMPILER "${CUSAN_MPICXX}" ) configure_cusan_script( - cusan-wrapper.in cusan-mpic++-test + ${CUSAN_WRAPPER} cusan-mpic++-test WITH_FILTER ON - COMPILER "${TYPEART_MPICXX}" + COMPILER "${CUSAN_MPICXX}" ) endif() configure_cusan_script( - cusan-wrapper.in cusan-clang${CMAKE_DEBUG_POSTFIX} + ${CUSAN_WRAPPER} cusan-clang${CMAKE_DEBUG_POSTFIX} INSTALL_MODE ON - COMPILER ${TYPEART_CLANG_EXEC} + COMPILER ${CUSAN_CLANG_EXEC} ) configure_cusan_script( - cusan-wrapper.in cusan-clang++${CMAKE_DEBUG_POSTFIX} + ${CUSAN_WRAPPER} cusan-clang++${CMAKE_DEBUG_POSTFIX} INSTALL_MODE ON - COMPILER ${TYPEART_CLANGCXX_EXEC} + COMPILER ${CUSAN_CLANGCXX_EXEC} ) configure_cusan_script( - cusan-wrapper.in cusan-clang-test - COMPILER ${TYPEART_CLANG_EXEC} + ${CUSAN_WRAPPER} cusan-clang-test + COMPILER ${CUSAN_CLANG_EXEC} ) configure_cusan_script( - cusan-wrapper.in cusan-clang++-test - COMPILER ${TYPEART_CLANGCXX_EXEC} + ${CUSAN_WRAPPER} cusan-clang++-test + COMPILER ${CUSAN_CLANGCXX_EXEC} ) #configure_cusan_script(cusan-tmpl.sh.in run.sh) -configure_cusan_script( - cusan-tmpl.sh.in apply.sh - APPLY_MODE ON -) +#if(CUSAN_TYPEART) + configure_cusan_script( + cusan-tmpl.sh.in apply.sh + APPLY_MODE ON + ) +#endif() -configure_cusan_script(cusan-wrapper.in cusan-clang.sh) +configure_cusan_script(${CUSAN_WRAPPER} cusan-clang.sh) install( PROGRAMS diff --git a/scripts/cusan-tmpl.sh.in b/scripts/cusan-tmpl.sh.in index 9d7f0a3..7dabc6e 100644 --- a/scripts/cusan-tmpl.sh.in +++ b/scripts/cusan-tmpl.sh.in @@ -7,7 +7,7 @@ # SPDX-License-Identifier: BSD-3-Clause function parse_cuda_cmd_line() { - typeart_non_cuda_args="" + cusan_non_cuda_args="" found_cuda=0 cuda_code_arch="sm_50" cuda_compute_arch="compute_50" @@ -19,7 +19,7 @@ function parse_cuda_cmd_line() { if [ "$2" == "cuda" ]; then found_cuda=1 else - typeart_non_cuda_args="$typeart_non_cuda_args $1 $2" + cusan_non_cuda_args="$cusan_non_cuda_args $1 $2" fi shift 2 ;; @@ -33,13 +33,13 @@ function parse_cuda_cmd_line() { shift 1 ;; *) # preserve other arguments - typeart_non_cuda_args="$typeart_non_cuda_args $1" + cusan_non_cuda_args="$cusan_non_cuda_args $1" shift ;; esac done # set other positional arguments in their proper place - set -- "typeart_non_cuda_args" + set -- "cusan_non_cuda_args" } function cusan_parse_cmd_line() { @@ -101,6 +101,14 @@ function cusan_parse_cmd_line() { show_cuda_host_ir=1 shift ;; + --cusan-kernel-data=*) + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + export CUSAN_KERNEL_DATA_FILE="${1##--cusan-kernel-data=}" + else + more_args="$more_args $1" + fi + shift + ;; *) 
# preserve other arguments more_args="$more_args $1" shift @@ -128,8 +136,6 @@ function cusan_parse_cmd_line() { } function cusan_global_init() { - typeart_global_init - local -r cusan_bin_dir="@CUSAN_BINARY_DIR@" local -r cusan_lib_dir="@CUSAN_RT_DIR@" local -r cusan_include_dir="@CUSAN_INCLUDE_DIRS@" @@ -140,34 +146,17 @@ function cusan_global_init() { -l$" readonly cusan_includes="${cusan_include_dir}" - readonly cusan_plugin="-load "${cusan_pass}" -cusan" + + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + readonly cusan_plugin="-load-pass-plugin "${cusan_pass}" -passes=cusan" + else + readonly cusan_plugin="-load "${cusan_pass}" -cusan" + fi + readonly cusan_cpu_mode="" readonly cusan_gpu_mode="" } -function typeart_global_init() { - local -r typeart_bin_dir="@TYPEART_BINARY_DIR@" - local -r typeart_lib_dir="@TYPEART_RT_DIR@" - local -r typeart_include_dir="@TYPEART_INCLUDE_DIRS@" - local -r typeart_pass="@TYPEART_PASS_DIR@/$" - readonly typeart_interceptor="@TYPEART_MPI_INTERCEPT_DIR@/@TYPEART_MPI_TOOL@" - - - readonly opt_tool="@TYPEART_OPT@" - readonly llc_tool="@TYPEART_LLC@" - - readonly typeart_includes="${typeart_include_dir}" - readonly typeart_ldflags="-L${typeart_lib_dir}/ \ - -Wl,-rpath,${typeart_lib_dir}/ \ - -l$" - - # shellcheck disable=SC2027 - readonly typeart_plugin="-load "${typeart_pass}" -typeart" - readonly typeart_stack_mode_args="-typeart-heap=false -typeart-stack -typeart-stats" - readonly typeart_heap_mode_args="-typeart-heap=true -typeart-stats" - readonly typeart_combined_mode_args="${typeart_heap_mode_args} -typeart-stack" -} - function cusan_toolchain_init() { readonly extension="${source_file##*.}" @@ -186,26 +175,34 @@ function cusan_toolchain_init() { ;; esac - readonly cusan_san_flags="@CUSAN_SAN_FLAGS@ -fsanitize=thread" + readonly opt_tool="@CUSAN_OPT@" + readonly llc_tool="@CUSAN_LLC_EXEC@" + readonly cusan_san_flags="-fsanitize=thread" } function cusan_make_ir() { local cuda_add_flag="$1" - $compiler ${cuda_add_flag} ${omp_flags} ${typeart_includes} ${cusan_include_dir} ${cusan_san_flags} \ - -O0 -Xclang -disable-O0-optnone -g ${compile_flags} -S -emit-llvm "${source_file}" -o - | - $opt_tool -mem2reg -S - # -O1 -Xclang -disable-llvm-passes -g ${compile_flags} -S -emit-llvm "${source_file}" -o - - + + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + $compiler ${cuda_add_flag} ${omp_flags} ${cusan_include_dir} ${cusan_san_flags} \ + -O0 -Xclang -disable-O0-optnone -g ${compile_flags} -D__STRICT_ANSI__ -S -emit-llvm "${source_file}" -o - | + $opt_tool -passes=mem2reg -S + else + $compiler ${cuda_add_flag} ${omp_flags} ${cusan_include_dir} ${cusan_san_flags} \ + -O0 -Xclang -disable-O0-optnone -g ${compile_flags} -S -emit-llvm "${source_file}" -o - | + $opt_tool -mem2reg -S + fi + } function cusan_source_to_llvm() { local cuda_add_flag="" - if [ $found_cuda == 1 ]; then + if [ "$found_cuda" == 1 ]; then cuda_add_flag="" cuda_add_flag+=" -x cuda" - if [ $show_cuda_ir == 1 ]; then + if [ "$show_cuda_ir" == 1 ]; then cuda_add_flag+=" --cuda-device-only --cuda-gpu-arch=${cuda_code_arch}" - elif [ $show_cuda_host_ir == 1 ]; then + elif [ "$show_cuda_host_ir" == 1 ]; then cuda_add_flag+=" --cuda-host-only" # generate kernel data, workaround cusan_device_pass no_out @@ -227,32 +224,26 @@ function cusan_device_pass() { function cusan_apply_pass() { cusan_source_to_llvm | - $opt_tool ${typeart_plugin} ${typeart_heap_mode_args} ${ta_more_args} -S | - $opt_tool ${cusan_plugin} ${more_args} -S | - # $opt_tool -passes="tsan-module,tsan" -S | - $opt_tool 
${typeart_plugin} ${typeart_stack_mode_args} ${ta_more_args} -S + $opt_tool ${cusan_plugin} ${more_args} -S } function cusan_apply_pass_optim() { cusan_source_to_llvm | - $opt_tool ${typeart_plugin} ${typeart_heap_mode_args} ${ta_more_args} | $opt_tool ${cusan_plugin} ${more_args} | - # $opt_tool -passes="tsan-module,tsan" -S | - $opt_tool ${optimize} -S | - $opt_tool ${typeart_plugin} ${typeart_stack_mode_args} ${ta_more_args} -S + $opt_tool ${optimize} -S } function cusan_compile() { local llc_flags="--filetype=obj" if [ "$optimize" == "-O0" ]; then - if [ $show_cuda_ir == 0 ] && [ $show_cuda_host_ir == 0 ]; then + if [ "$show_cuda_ir" == 0 ] && [ $show_cuda_host_ir == 0 ]; then cusan_apply_pass | $llc_tool -x=ir ${llc_flags} -o "${object_file}" else cusan_apply_pass fi else - if [ $show_cuda_ir == 0 ] && [ $show_cuda_host_ir == 0 ]; then + if [ "$show_cuda_ir" == 0 ] && [ $show_cuda_host_ir == 0 ]; then cusan_apply_pass_optim | $llc_tool -x=ir ${llc_flags} -o "${object_file}" else cusan_apply_pass_optim @@ -260,30 +251,17 @@ function cusan_compile() { fi } -function cusan_main_link() { - $compiler ${cusan_san_flags} ${cusan_ldflags} "${object_file}" -o "${exe_file}" -} - -function cusan_execute() { - export TSAN_OPTIONS='ignore_noninstrumented_modules=1' - echo -e Executing with runtime lib - LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${cusan_lib_dir}" "${exe_file}" -} - function parse_commands() { parse_cuda_cmd_line "$@" - cusan_parse_cmd_line ${typeart_non_cuda_args} + cusan_parse_cmd_line ${cusan_non_cuda_args} } function cusan_main_in() { + set +x parse_commands "$@" cusan_global_init cusan_toolchain_init cusan_compile - if [ @CUSAN_RUN_SCRIPT@ == 1 ]; then - cusan_main_link - cusan_execute - fi } cusan_main_in "$@" diff --git a/scripts/cusan-wrapper.in b/scripts/cusan-wrapper.in index 2e01406..54c80cd 100644 --- a/scripts/cusan-wrapper.in +++ b/scripts/cusan-wrapper.in @@ -13,7 +13,7 @@ # SPDX-License-Identifier: BSD-3-Clause # -function typeart_global_env_var_init_fn() { +function cusan_global_env_var_init_fn() { if [ -n "${TYPEART_WRAPPER_CONFIG+x}" ]; then typeart_cmdline_args_heap="${TYPEART_WRAPPER_CONFIG}" typeart_cmdline_args_stack="${TYPEART_WRAPPER_CONFIG}" @@ -26,22 +26,17 @@ function typeart_global_env_var_init_fn() { fi # shellcheck disable=SC2153 - case "${TYPEART_WRAPPER_EMIT_IR}" in + case "${CUSAN_WRAPPER_EMIT_IR}" in on | ON | 1 | true | TRUE) - typeart_wrapper_emit_ir=1 + cusan_wrapper_emit_ir=1 ;; *) - typeart_wrapper_emit_ir=0 + cusan_wrapper_emit_ir=0 ;; esac } -function typeart_is_wrapper_disabled_fn() { - case "${TYPEART_WRAPPER}" in - off | OFF | 0 | false | FALSE) - return 1 - ;; - esac +function cusan_is_wrapper_disabled_fn() { case "${CUSAN_WRAPPER}" in off | OFF | 0 | false | FALSE) return 1 @@ -51,8 +46,8 @@ function typeart_is_wrapper_disabled_fn() { } function cusan_global_init_fn() { - local -r typeart_use_rel_path=@TYPEART_RELOCATABLE@ - if [ "$typeart_use_rel_path" == 0 ]; then + local -r cusan_use_rel_path=@CUSAN_RELOCATABLE@ + if [ "$cusan_use_rel_path" == 0 ]; then local -r cusan_bin_dir="@CUSAN_BINARY_DIR@" local -r cusan_lib_dir="@CUSAN_RT_DIR@" local -r cusan_include_dir="@CUSAN_INCLUDE_DIRS@" @@ -71,18 +66,38 @@ function cusan_global_init_fn() { -Wl,-rpath,${cusan_lib_dir}/ \ -l$" + readonly cusan_san_flags="@CUSAN_SAN_FLAG@" + readonly cusan_includes="${cusan_include_dir}" - readonly cusan_plugin="-load "${cusan_pass}" -cusan" + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + readonly cusan_plugin="-load-pass-plugin "${cusan_pass}" -passes=cusan" + 
else + readonly cusan_plugin="-load "${cusan_pass}" -cusan" + fi readonly cusan_cpu_mode="" - if [ ${typeart_wrapper_emit_ir} == 1 ]; then + if [ "${cusan_wrapper_emit_ir}" == 1 ]; then readonly cusan_gpu_mode="" else readonly cusan_gpu_mode="" #"--cusan-quiet" fi + + readonly cusan_compiler="@CUSAN_COMPILER@" + readonly cusan_opt_tool="@CUSAN_OPT@" + readonly cusan_llc_tool="@CUSAN_LLC_EXEC@" + + readonly cusan_to_llvm_flags="-O0 -Xclang -disable-O0-optnone -g -c -emit-llvm" + cusan_to_llvm_more_flags="" + if [ "${cusan_wrapper_emit_ir}" == 1 ]; then + cusan_to_llvm_more_flags="-fno-discard-value-names" + fi + + if [ "@CUSAN_WITH_TYPEART@" == 1 ]; then + typeart_global_init_fn + fi } function typeart_global_init_fn() { - local -r typeart_use_rel_path=@TYPEART_RELOCATABLE@ + local -r typeart_use_rel_path=@CUSAN_RELOCATABLE@ if [ "$typeart_use_rel_path" == 0 ]; then local -r typeart_bin_dir="@TYPEART_BINARY_DIR@" local -r typeart_lib_dir="@TYPEART_RT_DIR@" @@ -98,16 +113,11 @@ function typeart_global_init_fn() { local -r typeart_pass="${typeart_lib_dir}/$" fi - readonly typeart_compiler="@TYPEART_COMPILER@" - readonly typeart_opt_tool="@TYPEART_OPT@" - readonly typeart_llc_tool="@TYPEART_LLC@" - readonly typeart_includes="${typeart_include_dir}" # shellcheck disable=SC2089 readonly typeart_ldflags="-L${typeart_lib_dir}/ \ -Wl,-rpath,${typeart_lib_dir}/ \ -l$" - readonly typeart_san_flags="@TYPEART_SAN_FLAGS@" # shellcheck disable=SC2027 readonly typeart_plugin="-load "${typeart_pass}" -typeart" @@ -117,21 +127,14 @@ function typeart_global_init_fn() { # Used for values passed to wrapper: typeart_cmdline_args_heap="" typeart_cmdline_args_stack="" - typeart_global_env_var_init_fn - - readonly typeart_to_llvm_flags="-O0 -Xclang -disable-O0-optnone -g -c -emit-llvm" - typeart_to_llvm_more_flags="" - if [ ${typeart_wrapper_emit_ir} == 1 ]; then - typeart_to_llvm_more_flags="-fno-discard-value-names" - fi } -function typeart_global_cuda_init_fn() { +function cusan_global_cuda_init_fn() { # TODO: use generator expr. w.r.t. 
CUDA::cudart - readonly typeart_cuda_ldflags="-L@CUDAToolkit_LIBRARY_DIR@/ -lcudart -lcudart_static -lcudadevrt" + readonly cusan_cuda_ldflags="-L@CUDAToolkit_LIBRARY_DIR@/ -lcudart -lcudart_static -lcudadevrt" } -function typeart_is_typeart_linking_fn() { +function cusan_is_cusan_linking_fn() { local arg="" for arg in "$@"; do case "$arg" in @@ -143,7 +146,7 @@ function typeart_is_typeart_linking_fn() { return 1 } -function typeart_has_source_fn() { +function cusan_has_source_fn() { local arg="" for arg in "$@"; do local extension_of_arg="${arg##*.}" @@ -156,7 +159,7 @@ function typeart_has_source_fn() { return 0 } -function typeart_skip_fn() { +function cusan_skip_fn() { # -E inline header; -M list (all) headers; -MM list file deps local arg="" for arg in "$@"; do @@ -169,15 +172,15 @@ function typeart_skip_fn() { return 0 } -function typeart_try_extract_source_fn() { +function cusan_try_extract_source_fn() { # $1 == flag (source file); $2 == shift value local -r extension="${1##*.}" local -r shift_val="$2" case "$extension" in cpp | cxx | cc | c | cu) - typeart_source_file="$1" - typeart_found_src_file=1 + cusan_source_file="$1" + cusan_found_src_file=1 return "$shift_val" ;; *) @@ -186,28 +189,28 @@ function typeart_try_extract_source_fn() { esac } -function typeart_handle_source_flag_fn() { +function cusan_handle_source_flag_fn() { if [ -n "$2" ]; then - typeart_try_extract_source_fn "$2" 2 + cusan_try_extract_source_fn "$2" 2 else - typeart_try_extract_source_fn "$1" 1 + cusan_try_extract_source_fn "$1" 1 fi return $? } -function typeart_try_extract_object_fn() { +function cusan_try_extract_object_fn() { # $1 == flag (obj file); $2 == shift value local -r extension="${1##*.}" local -r shift_val="$2" case "$extension" in o) - typeart_object_file="$1" - typeart_found_obj_file=1 + cusan_object_file="$1" + cusan_found_obj_file=1 return "$shift_val" ;; -) - typeart_to_stdout=1 + cusan_to_stdout=1 return "$shift_val" ;; *) @@ -216,213 +219,194 @@ function typeart_try_extract_object_fn() { esac } -function typeart_handle_object_flag_fn() { +function cusan_handle_object_flag_fn() { if [ -n "$2" ]; then - typeart_try_extract_object_fn "$2" 2 + cusan_try_extract_object_fn "$2" 2 else - typeart_try_extract_object_fn "$1" 1 + cusan_try_extract_object_fn "$1" 1 fi return $? 
} -function typeart_handle_binary_fn() { +function cusan_handle_binary_fn() { if [ -n "$2" ]; then - typeart_exe_file="$2" - typeart_found_exe_file=1 + cusan_exe_file="$2" + cusan_found_exe_file=1 fi return 2 } -function typeart_parse_typeart_cmd_line_fn() { - typeart_other_args="" - - while (("$#")); do - case "$1" in - --typeart-config=*) - typeart_cmdline_args_heap="${1##-typeart-config=}" - typeart_cmdline_args_stack="${1##-typeart-config=}" - shift - ;; - --typeart-heap-config=*) - typeart_cmdline_args_heap="${1##-typeart-heap-config=}" - shift - ;; - --typeart-stack-config=*) - typeart_cmdline_args_stack="${1##-typeart-stack-config=}" - shift - ;; - *) # preserve other arguments - typeart_other_args+=" $1" - shift - ;; - esac - done -} - -function typeart_parse_cuda_cmd_line_fn() { - typeart_non_cuda_args="" - typeart_found_cuda=0 - typeart_cuda_code_arch="" - typeart_cuda_compute_arch="" +function cusan_parse_cuda_cmd_line_fn() { + cusan_non_cuda_args="" + cusan_found_cuda=0 + cusan_cuda_code_arch="" + cusan_cuda_compute_arch="" while (("$#")); do case "$1" in *.cu) - typeart_found_cuda=1 - typeart_non_cuda_args="$typeart_non_cuda_args $1" + cusan_found_cuda=1 + cusan_non_cuda_args="$cusan_non_cuda_args $1" shift 1 ;; -x) if [ "$2" == "cuda" ]; then - typeart_found_cuda=1 + cusan_found_cuda=1 else - typeart_non_cuda_args="$typeart_non_cuda_args $1 $2" + cusan_non_cuda_args="$cusan_non_cuda_args $1 $2" fi shift 2 ;; --cuda-gpu-arch=*) - typeart_cuda_code_arch="${1##--cuda-gpu-arch=}" - typeart_cuda_compute_arch="compute${typeart_cuda_code_arch##sm}" + cusan_cuda_code_arch="${1##--cuda-gpu-arch=}" + cusan_cuda_compute_arch="compute${cusan_cuda_code_arch##sm}" shift 1 ;; -code=*) - typeart_cuda_code_arch="${1##-code=}" + cusan_cuda_code_arch="${1##-code=}" shift 1 ;; -arch=*) - typeart_cuda_compute_arch="${1##-arch=}" + cusan_cuda_compute_arch="${1##-arch=}" shift 1 ;; -gencode) shift 1 ;; arch=*) - typeart_cuda_compute_arch="${1%%,code=*}" - typeart_cuda_compute_arch="${typeart_cuda_compute_arch##arch=}" - typeart_cuda_code_arch="${1##*code=}" + cusan_cuda_compute_arch="${1%%,code=*}" + cusan_cuda_compute_arch="${cusan_cuda_compute_arch##arch=}" + cusan_cuda_code_arch="${1##*code=}" shift 1 ;; *) # preserve other arguments - typeart_non_cuda_args="$typeart_non_cuda_args $1" + cusan_non_cuda_args="$cusan_non_cuda_args $1" shift ;; esac done # set other positional arguments in their proper place - set -- "typeart_non_cuda_args" + set -- "cusan_non_cuda_args" } # shellcheck disable=SC2034 -function typeart_parse_cmd_line_fn() { - typeart_found_src_file=0 - typeart_found_obj_file=0 - typeart_found_exe_file=0 - typeart_found_fpic=0 - typeart_skip=0 - typeart_to_asm=0 - typeart_exe_file="" - typeart_source_file="" - typeart_object_file="" - typeart_asm_file="" - typeart_wrapper_more_args="" - typeart_optimize="" - typeart_emit_llvm=0 - typeart_to_stdout=0 +function cusan_parse_cmd_line_fn() { + cusan_found_src_file=0 + cusan_found_obj_file=0 + cusan_found_exe_file=0 + cusan_found_fpic=0 + cusan_skip=0 + cusan_to_asm=0 + cusan_exe_file="" + cusan_source_file="" + cusan_object_file="" + cusan_asm_file="" + cusan_wrapper_more_args="" + cusan_optimize="" + cusan_emit_llvm=0 + cusan_to_stdout=0 while (("$#")); do case "$1" in -O?) 
- typeart_optimize=$1 + cusan_optimize=$1 shift 1 ;; -MT) if [ -n "$2" ]; then - typeart_wrapper_more_args+=" $1 $2" + cusan_wrapper_more_args+=" $1 $2" shift 2 else - typeart_wrapper_more_args+=" $1" + cusan_wrapper_more_args+=" $1" shift 1 fi ;; -S) - typeart_to_asm=1 + cusan_to_asm=1 shift 1 ;; -c) shift 1 ;; *.s | *.bc | *.ll) - typeart_asm_file="$1" + cusan_asm_file="$1" shift 1 ;; *.cpp | *.cxx | *.cc | *.c | *.cu) - typeart_handle_source_flag_fn "$1" + cusan_handle_source_flag_fn "$1" shift $? ;; -o) # shellcheck disable=SC2154 - if [ "$typeart_linking" == 1 ]; then - typeart_handle_binary_fn "$1" "$2" + if [ "$cusan_linking" == 1 ]; then + cusan_handle_binary_fn "$1" "$2" else - typeart_handle_object_flag_fn "$1" "$2" + cusan_handle_object_flag_fn "$1" "$2" fi shift $? ;; *.o) - if [ "$typeart_linking" == 0 ]; then - typeart_handle_object_flag_fn "$1" + if [ "$cusan_linking" == 0 ]; then + cusan_handle_object_flag_fn "$1" shift $? else - # when typeart_linking, we don't care about object files - typeart_wrapper_more_args+=" $1" + # when cusan_linking, we don't care about object files + cusan_wrapper_more_args+=" $1" shift 1 fi ;; -fPIC) # llc requires special flag - typeart_found_fpic=1 - typeart_wrapper_more_args+=" $1" + cusan_found_fpic=1 + cusan_wrapper_more_args+=" $1" shift 1 ;; -emit-llvm) - typeart_emit_llvm=1 + cusan_emit_llvm=1 shift 1 ;; + --cusan-kernel-data=*) + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + export CUSAN_KERNEL_DATA_FILE="${1##--cusan-kernel-data=}" + else + cusan_wrapper_more_args+=" $1" + fi + shift + ;; *) # preserve other arguments - typeart_wrapper_more_args+=" $1" + cusan_wrapper_more_args+=" $1" shift 1 ;; esac done - if [ -z "${typeart_optimize}" ]; then - typeart_optimize=-O0 + if [ -z "${cusan_optimize}" ]; then + cusan_optimize=-O0 fi } -function typeart_parse_commands_fn() { - typeart_parse_typeart_cmd_line_fn "$@" - typeart_parse_cuda_cmd_line_fn ${typeart_other_args} - typeart_parse_cmd_line_fn ${typeart_non_cuda_args} +function cusan_parse_commands_fn() { + cusan_parse_cuda_cmd_line_fn "$@" + cusan_parse_cmd_line_fn ${cusan_non_cuda_args} } -function typeart_parse_link_objects_fn() { - typeart_link_more_args="" - typeart_link_objs="" - typeart_cuda_link_objs="" +function cusan_parse_link_objects_fn() { + cusan_link_more_args="" + cusan_link_objs="" + cusan_cuda_link_objs="" while (("$#")); do case "$1" in *.o) - typeart_link_objs="${typeart_link_objs} $1" + cusan_link_objs="${cusan_link_objs} $1" local object_dlink="${1%.*}_dlink.o" if [ -f "${object_dlink}" ]; then - typeart_cuda_link_objs="${typeart_cuda_link_objs} ${object_dlink}" + cusan_cuda_link_objs="${cusan_cuda_link_objs} ${object_dlink}" fi shift ;; *) - typeart_link_more_args+=" $1" + cusan_link_more_args+=" $1" shift ;; esac @@ -430,170 +414,193 @@ function typeart_parse_link_objects_fn() { } -function typeart_main_link_fn() { - typeart_parse_link_objects_fn "$@" - if [ "$typeart_found_cuda" == 1 ]; then - typeart_link_objs="${typeart_link_objs} ${typeart_cuda_link_objs}" +function cusan_main_link_fn() { + cusan_parse_link_objects_fn "$@" + if [ "$cusan_found_cuda" == 1 ]; then + cusan_link_objs="${cusan_link_objs} ${cusan_cuda_link_objs}" fi # shellcheck disable=SC2086 disable=SC2068 - $typeart_compiler ${cusan_includes} ${typeart_includes} ${typeart_ldflags} ${cusan_ldflags} ${typeart_cuda_ldflags} ${typeart_san_flags} ${typeart_link_more_args} ${typeart_link_objs} + $cusan_compiler ${cusan_ldflags} ${typeart_ldflags} ${cusan_cuda_ldflags} ${cusan_san_flags} 
${cusan_link_more_args} ${cusan_link_objs} } # shellcheck disable=SC2068 -function typeart_redirect_fn() { +function cusan_redirect_fn() { # First argument of $@ must be "redirect file name" # Rest are the std arguments for opt - if [ -z ${typeart_wrapper_emit_ir} ] || [ ${typeart_wrapper_emit_ir} -eq 0 ]; then - $typeart_command_exe ${@:2} + if [ -z ${cusan_wrapper_emit_ir} ] || [ ${cusan_wrapper_emit_ir} -eq 0 ]; then + $cusan_command_exe ${@:2} else - $typeart_command_exe -S ${@:2} | tee "${@:1:1}" + $cusan_command_exe -S ${@:2} | tee "${@:1:1}" fi } -function typeart_opt_fn() { - local typeart_command_exe="$typeart_opt_tool" - typeart_redirect_fn "$@" +function cusan_opt_fn() { + local cusan_command_exe="$cusan_opt_tool" + cusan_redirect_fn "$@" } -function typeart_compiler_fn() { - local typeart_command_exe="$typeart_compiler" - typeart_redirect_fn "$@" +function cusan_compiler_fn() { + local cusan_command_exe="$cusan_compiler" + cusan_redirect_fn "$@" } # shellcheck disable=SC2120 -function typeart_tu_out_fn() { - local out_file="${typeart_object_file}" +function cusan_tu_out_fn() { + local out_file="${cusan_object_file}" local llc_flags="--filetype=obj" - if [ "$typeart_to_asm" == 1 ]; then + if [ "$cusan_to_asm" == 1 ]; then local llc_flags="--filetype=asm" fi - if [ -z "${typeart_asm_file}" ]; then - if [ "$typeart_emit_llvm" == 1 ] && [ "$typeart_to_asm" == 1 ]; then - local typeart_asm_file="${out_basename}".ll - elif [ "$typeart_emit_llvm" == 1 ]; then - local typeart_asm_file="${out_basename}".bc - elif [ "$typeart_to_asm" == 1 ]; then - local typeart_asm_file="${out_basename}".s + if [ -z "${cusan_asm_file}" ]; then + if [ "$cusan_emit_llvm" == 1 ] && [ "$cusan_to_asm" == 1 ]; then + local cusan_asm_file="${out_basename}".ll + elif [ "$cusan_emit_llvm" == 1 ]; then + local cusan_asm_file="${out_basename}".bc + elif [ "$cusan_to_asm" == 1 ]; then + local cusan_asm_file="${out_basename}".s fi fi - if [ "$typeart_emit_llvm" == 1 ] || [ "$typeart_to_asm" == 1 ]; then - local out_file="${typeart_asm_file}" + if [ "$cusan_emit_llvm" == 1 ] || [ "$cusan_to_asm" == 1 ]; then + local out_file="${cusan_asm_file}" fi - if [ "$typeart_found_fpic" == 1 ]; then + if [ "$cusan_found_fpic" == 1 ]; then local llc_flags+=" --relocation-model=pic" fi - if [ "$typeart_emit_llvm" == 1 ] && [ "$typeart_to_asm" == 1 ]; then - local typeart_command_exe="${typeart_opt_tool} -S" - elif [ "$typeart_emit_llvm" == 1 ]; then - local typeart_command_exe="${typeart_opt_tool} -f" + if [ "$cusan_emit_llvm" == 1 ] && [ "$cusan_to_asm" == 1 ]; then + local cusan_command_exe="${cusan_opt_tool} -S" + elif [ "$cusan_emit_llvm" == 1 ]; then + local cusan_command_exe="${cusan_opt_tool} -f" else - local typeart_command_exe="${typeart_llc_tool} -x=ir ${llc_flags}" + local cusan_command_exe="${cusan_llc_tool} -x=ir ${llc_flags}" fi - if [ "${typeart_to_stdout}" == 0 ]; then - local typeart_command_exe+=" -o ${out_file}" + if [ "${cusan_to_stdout}" == 0 ]; then + local cusan_command_exe+=" -o ${out_file}" fi - $typeart_command_exe "$@" + $cusan_command_exe "$@" } # shellcheck disable=SC2086 -function typeart_compile_cuda_device_fn() { - $typeart_compiler ${typeart_wrapper_more_args} -gdwarf-4 ${typeart_includes} ${typeart_to_llvm_flags} \ - -fPIC -x cuda "${typeart_source_file}" --cuda-device-only --cuda-gpu-arch=${typeart_cuda_code_arch} -o - | - typeart_opt_fn "${out_basename}"_cusan_gpu_mem2reg.ll -mem2reg | - typeart_opt_fn "${out_basename}"_cusan_gpu.ll ${cusan_plugin} ${cusan_gpu_mode} | - typeart_opt_fn 
"${out_basename}"_cusan_gpu_opt.ll ${typeart_optimize} | - $typeart_llc_tool -dwarf-directory=0 -x=ir --relocation-model=pic -march=nvptx64 -mcpu=${typeart_cuda_code_arch} -mattr=+ptx64 \ +function cusan_compile_cuda_device_fn() { + if [ "@CUSAN_NEW_PM_REQUIRED@" == 1 ]; then + local -r mem_to_reg_pass="-passes=mem2reg" + else + local -r mem_to_reg_pass="-mem2reg" + fi + + $cusan_compiler ${cusan_wrapper_more_args} -gdwarf-4 ${cusan_includes} ${cusan_to_llvm_flags} \ + -fPIC -x cuda "${cusan_source_file}" --cuda-device-only --cuda-gpu-arch=${cusan_cuda_code_arch} -o - | + cusan_opt_fn "${out_basename}"_cusan_gpu_mem2reg.ll ${mem_to_reg_pass} | + cusan_opt_fn "${out_basename}"_cusan_gpu.ll ${cusan_plugin} ${cusan_gpu_mode} | + cusan_opt_fn "${out_basename}"_cusan_gpu_opt.ll ${cusan_optimize} | + $cusan_llc_tool -dwarf-directory=0 -x=ir --relocation-model=pic -march=nvptx64 -mcpu=${cusan_cuda_code_arch} -mattr=+ptx64 \ -o "${out_basename}".ptx - ptxas -m64 --gpu-name=${typeart_cuda_code_arch} "${out_basename}".ptx -o "${out_basename}".ptx.o + ptxas -m64 --gpu-name=${cusan_cuda_code_arch} "${out_basename}".ptx -o "${out_basename}".ptx.o - fatbinary --64 --create "${out_basename}".fatbin --image=profile=${typeart_cuda_code_arch},file="${out_basename}".ptx.o \ - --image=profile=${typeart_cuda_compute_arch},file="${out_basename}".ptx -link + fatbinary --64 --create "${out_basename}".fatbin --image=profile=${cusan_cuda_code_arch},file="${out_basename}".ptx.o \ + --image=profile=${cusan_cuda_compute_arch},file="${out_basename}".ptx -link - #nvcc -gencode arch=${typeart_cuda_compute_arch},code=${typeart_cuda_code_arch} ${typeart_cuda_ldflags} \ + #nvcc -gencode arch=${cusan_cuda_compute_arch},code=${cusan_cuda_code_arch} ${cusan_cuda_ldflags} \ # -dlink "${out_basename}".fatbin -o "${out_basename}"_dlink.o } -function typeart_compile_host_fn() { +function cusan_to_llvm_ir_fn() { + cusan_compiler_fn "${out_basename}"_base.ll -gdwarf-4 ${cusan_wrapper_more_args} ${typeart_includes} ${cusan_includes} ${cusan_san_flags} \ + ${cusan_to_llvm_flags} ${cusan_to_llvm_more_flags} "${cusan_source_file}" -o - +} + +function cusan_compile_with_cusan_host_fn() { + cusan_opt_fn "${out_basename}"_cusan_cpu.ll ${cusan_plugin} ${cusan_cpu_mode} | + cusan_opt_fn "${out_basename}"_opt.ll ${cusan_optimize} +} + +function cusan_compile_host_fn() { + cusan_to_llvm_ir_fn | + cusan_compile_with_cusan_host_fn | + cusan_tu_out_fn +} + +function cusan_compile_host_with_typeart_fn() { # shellcheck disable=SC2086 - typeart_compiler_fn "${out_basename}"_base.ll -gdwarf-4 ${typeart_wrapper_more_args} ${typeart_includes} ${cusan_includes} ${typeart_san_flags} \ - ${typeart_to_llvm_flags} ${typeart_to_llvm_more_flags} "${typeart_source_file}" -o - | - typeart_opt_fn "${out_basename}"_heap.ll ${typeart_plugin} ${typeart_heap_mode_args} ${typeart_cmdline_args_heap} | - typeart_opt_fn "${out_basename}"_cusan_cpu.ll ${cusan_plugin} ${cusan_cpu_mode} | - typeart_opt_fn "${out_basename}"_opt.ll ${typeart_optimize} | - typeart_opt_fn "${out_basename}"_stack.ll ${typeart_plugin} ${typeart_stack_mode_args} ${typeart_cmdline_args_stack} | - typeart_tu_out_fn + cusan_to_llvm_ir_fn | + cusan_opt_fn "${out_basename}"_heap.ll ${typeart_plugin} ${typeart_heap_mode_args} ${typeart_cmdline_args_heap} | + cusan_compile_with_cusan_host_fn | + cusan_opt_fn "${out_basename}"_stack.ll ${typeart_plugin} ${typeart_stack_mode_args} ${typeart_cmdline_args_stack} | + cusan_tu_out_fn } -function typeart_main_compile_fn() { - if [ "${typeart_found_exe_file}" 
== 0 ]; then - local -r out_basename="${typeart_source_file%.*}" +function cusan_main_compile_fn() { + if [ "${cusan_found_exe_file}" == 0 ]; then + local -r out_basename="${cusan_source_file%.*}" else # put temp files like .ptx, .fatbin to the location of the executable - local -r out_base_exe="${typeart_exe_file%/*}" - local -r out_basename_ext="${typeart_source_file##*/}" + local -r out_base_exe="${cusan_exe_file%/*}" + local -r out_basename_ext="${cusan_source_file##*/}" local -r out_basename=${out_base_exe}/"${out_basename_ext%.*}" fi - if [ -z "${typeart_object_file}" ]; then - # if no object file is specified, use filename(typeart_source_file).o - typeart_object_file="${out_basename}".o + if [ -z "${cusan_object_file}" ]; then + # if no object file is specified, use filename(cusan_source_file).o + cusan_object_file="${out_basename}".o fi - local -r cusan_is_installed=@TYPEART_RELOCATABLE@ + local -r cusan_is_installed=@CUSAN_RELOCATABLE@ if [ $cusan_is_installed == 0 ];then export CUSAN_KERNEL_DATA_FILE="${out_basename}".yaml fi - if [ "$typeart_found_cuda" == 1 ]; then - typeart_compile_cuda_device_fn - typeart_to_llvm_more_flags="$typeart_to_llvm_more_flags -fPIC -x cuda --cuda-host-only -Xclang -fcuda-include-gpubinary -Xclang ${out_basename}.fatbin" + if [ "$cusan_found_cuda" == 1 ]; then + cusan_compile_cuda_device_fn + cusan_to_llvm_more_flags="$cusan_to_llvm_more_flags -fPIC -x cuda --cuda-host-only -Xclang -fcuda-include-gpubinary -Xclang ${out_basename}.fatbin" fi - typeart_compile_host_fn + if [ "@CUSAN_WITH_TYPEART@" == 1 ]; then + cusan_compile_host_with_typeart_fn + else + cusan_compile_host_fn + fi } -function typeart_main_driver_fn() { - typeart_global_init_fn +function cusan_main_driver_fn() { cusan_global_init_fn - typeart_global_cuda_init_fn + cusan_global_cuda_init_fn - typeart_is_wrapper_disabled_fn - readonly typeart_disabled=$? - typeart_skip_fn "$@" - if [ "$?" == 1 ] || [ "$typeart_disabled" == 1 ]; then + cusan_is_wrapper_disabled_fn + readonly cusan_disabled=$? + cusan_skip_fn "$@" + if [ "$?" == 1 ] || [ "$cusan_disabled" == 1 ]; then # shellcheck disable=SC2068 - $typeart_compiler $@ + $cusan_compiler $@ return 0 fi - typeart_is_typeart_linking_fn "$@" - local -r typeart_linking=$? - typeart_has_source_fn "$@" + cusan_is_cusan_linking_fn "$@" + local -r cusan_linking=$? + cusan_has_source_fn "$@" local -r with_source=$? 
- if [ "$typeart_linking" == 1 ] && [ "$with_source" == 1 ]; then - typeart_parse_commands_fn "$@" - typeart_main_compile_fn "$@" - if [ "$typeart_found_exe_file" == 1 ]; then - typeart_wrapper_more_args+=" -o ${typeart_exe_file}" + if [ "$cusan_linking" == 1 ] && [ "$with_source" == 1 ]; then + cusan_parse_commands_fn "$@" + cusan_main_compile_fn "$@" + if [ "$cusan_found_exe_file" == 1 ]; then + cusan_wrapper_more_args+=" -o ${cusan_exe_file}" fi - typeart_main_link_fn "$typeart_wrapper_more_args" "${typeart_object_file}" - if [ -f "${typeart_object_file}" ]; then - rm "${typeart_object_file}" + cusan_main_link_fn "$cusan_wrapper_more_args" "${cusan_object_file}" + if [ -f "${cusan_object_file}" ]; then + rm "${cusan_object_file}" fi - elif [ "$typeart_linking" == 1 ]; then - typeart_main_link_fn "$@" + elif [ "$cusan_linking" == 1 ]; then + cusan_main_link_fn "$@" else - typeart_parse_commands_fn "$@" - typeart_main_compile_fn "$@" + cusan_parse_commands_fn "$@" + cusan_main_compile_fn "$@" fi } -typeart_main_driver_fn "$@" +cusan_main_driver_fn "$@" diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 96cc0d7..34234fe 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -61,9 +61,9 @@ function(cusan_add_lit_testsuite target comment) endfunction() function(cusan_add_lit_target) - cmake_parse_arguments(ARG "" "" "SUITES" ${ARGN}) + cmake_parse_arguments(ARG "" "" "SUITES;WORKERS" ${ARGN}) - foreach(suite IN LISTS ARG_SUITES) + foreach(suite num_workers IN ZIP_LISTS ARG_SUITES ARG_WORKERS) if("${suite}" STREQUAL "all") set(SUITE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) set(TARGET_NAME check-cusan) @@ -72,7 +72,7 @@ function(cusan_add_lit_target) set(TARGET_NAME check-cusan-${suite}) endif() - if(NOT EXISTS ${SUITE_PATH}) + if(NOT EXISTS ${SUITE_PATH} AND NOT ${suite} STREQUAL "staging") message(WARNING "Could not find suitable lit test target at ${SUITE_PATH}") continue() endif() @@ -80,7 +80,7 @@ function(cusan_add_lit_target) cusan_add_lit_testsuite(${TARGET_NAME} "Running the lit suite cusan::${suite}" ${SUITE_PATH} - ARGS -v -j 1 + ARGS -v -j ${num_workers} PARAMS cusan_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg DEPENDS ${CUSAN_TEST_DEPENDS} ) @@ -92,17 +92,36 @@ set(CUSAN_TEST_DEPENDS cusan::MPI_Interceptor cusan::Runtime cusan::Analysis - typeart::Runtime - typeart::TransformPass + ) +if(CUSAN_TYPEART) +list(APPEND CUSAN_TEST_DEPENDS typeart::Runtime typeart::TransformPass) +endif() + set(CUSAN_SUITES all + runtime pass - tsan + kernel_analysis + staging +) + +include(ProcessorCount) +ProcessorCount(NUM_CPU) +if(NUM_CPU EQUAL 0) + set(NUM_CPU 1) +endif() + +set(CUSAN_SUITES_WORKERS + 1 + 1 + ${NUM_CPU} + ${NUM_CPU} + 1 ) -cusan_add_lit_target(SUITES ${CUSAN_SUITES}) +cusan_add_lit_target(SUITES ${CUSAN_SUITES} WORKERS ${CUSAN_SUITES_WORKERS}) add_test( NAME cusan-lit-suite diff --git a/test/kernel_analysis/01_ptr_write.c b/test/kernel_analysis/01_ptr_write.c new file mode 100644 index 0000000..ca1f79d --- /dev/null +++ b/test/kernel_analysis/01_ptr_write.c @@ -0,0 +1,58 @@ +// clang-format off +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s + +// clang-format on + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}} ptr: 1, rw: Write +// CHECK-NOT: Handling Arg: + +#include +#include + +__global__ void kernel(int* data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + 
__nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + data[tid] = (tid + 1); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* d_data; // Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + kernel<<>>(d_data); + cudaEventRecord(endEvent); + +#ifdef CUSAN_SYNC + // Wait for the end event to complete (alternative to querying) + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/kernel_analysis/02_val_readwrite_ptr_read_ptr_write.c b/test/kernel_analysis/02_val_readwrite_ptr_read_ptr_write.c new file mode 100644 index 0000000..28ea679 --- /dev/null +++ b/test/kernel_analysis/02_val_readwrite_ptr_read_ptr_write.c @@ -0,0 +1,47 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}} ptr: 0, rw: ReadWrite +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}} ptr: 1, rw: Read +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}} ptr: 1, rw: ReadWrite +// CHECK-NOT: Handling Arg: + +#include +__device__ void axpy_write(float a, float* y) { + y[threadIdx.x] = a; +} + +__global__ void axpy(float a, float* x, float* y) { + axpy_write(a * x[threadIdx.x], y); +} + +int main(int argc, char* argv[]) { + const int kDataLen = 4; + + float a = 2.0f; + float host_x[kDataLen] = {1.0f, 2.0f, 3.0f, 4.0f}; + float host_y[kDataLen]; + + float* device_x; + float* device_y; + cudaMalloc((void**)&device_x, kDataLen * sizeof(float)); + cudaMalloc((void**)&device_y, kDataLen * sizeof(float)); + + cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice); + + axpy<<<1, kDataLen>>>(a, device_x, device_y); + + cudaDeviceSynchronize(); + cudaMemcpy(host_y, device_y, kDataLen * sizeof(float), cudaMemcpyDeviceToHost); + + for (int i = 0; i < kDataLen; ++i) { + printf("y[%i] = %f\n", i, host_y[i]); + } + + cudaDeviceReset(); + return 0; +} diff --git a/test/kernel_analysis/03_struct_write.c b/test/kernel_analysis/03_struct_write.c new file mode 100644 index 0000000..cde77e9 --- /dev/null +++ b/test/kernel_analysis/03_struct_write.c @@ -0,0 +1,101 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 0, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 1, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 1, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: 
ReadWrite +// CHECK-NOT: Handling Arg: + +#include "../support/gpu_mpi.h" + +struct BufferStorage { + int* buff1; + int* buff2; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1, size * sizeof(int)); + cudaMalloc(&buffStor.buff2, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + if (world_rank == 0) { + kernel1<<>>(buffStor, size); + kernel3<<>>( + buffStor, size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, + size); // also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + // MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + MPI_Send(buffStor.buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + // MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(buffStor.buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + kernel3<<>>( + buffStor, size); // problem since different stream but same write target + } + + cudaFree(buffStor.buff1); + cudaFree(buffStor.buff2); + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + MPI_Finalize(); + return 0; +} diff --git a/test/kernel_analysis/04_struct_ptr.c b/test/kernel_analysis/04_struct_ptr.c new file mode 100644 index 0000000..c37c4c2 --- /dev/null +++ b/test/kernel_analysis/04_struct_ptr.c @@ -0,0 +1,108 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[L, ], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:[L, [0, 0, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[L, ], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:[L, [0, 1, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[L, ], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:[L, [0, 1, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite +// CHECK-NOT: Handling Arg: + +// XFAIL: * + +#include "../support/gpu_mpi.h" + +struct BufferStorage { + int* buff1; + int* buff2; +}; + 
+__global__ void kernel1(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff1[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff2[tid] = tid * 32; + } +} +__global__ void kernel3(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff2[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + BufferStorage* buffStor; + cudaHostAlloc(&buffStor, sizeof(BufferStorage), 0); + cudaMalloc(&buffStor->buff1, size * sizeof(int)); + cudaMalloc(&buffStor->buff2, size * sizeof(int)); + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + if (world_rank == 0) { + kernel1<<>>(buffStor, size); + kernel3<<>>( + buffStor, size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, + size); // also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + MPI_Send(buffStor->buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + // MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + + } else if (world_rank == 1) { + MPI_Recv(buffStor->buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + // MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + kernel3<<>>( + buffStor, size); // problem since different stream but same write target + } + cudaDeviceSynchronize(); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaFree(buffStor->buff1); + cudaFree(buffStor->buff2); + cudaFree(buffStor); + MPI_Finalize(); + return 0; +} diff --git a/test/kernel_analysis/05_struct_inside_of_struct.c b/test/kernel_analysis/05_struct_inside_of_struct.c new file mode 100644 index 0000000..2b0dd0b --- /dev/null +++ b/test/kernel_analysis/05_struct_inside_of_struct.c @@ -0,0 +1,86 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:{{.}}[0, 0, ], [0, 0, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:{{.}}[0, 1, ], [0, 0, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}indices:{{.}}[0, 1, ], [0, 0, ], L, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK-NOT: Handling Arg: + +// XFAIL: * + +#include "../support/gpu_mpi.h" + +struct 
BufferStorage2 { + int* buff; +}; + +struct BufferStorage { + BufferStorage2 buff1; + BufferStorage2 buff2; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1.buff[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2.buff[tid] = tid * 32; + } +} + +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2.buff[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1.buff, size * sizeof(int)); + cudaMalloc(&buffStor.buff2.buff, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + kernel1<<>>(buffStor, size); + kernel3<<>>(buffStor, size); + kernel2<<>>(buffStor, size); +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + kernel3<<>>(buffStor, size); + + cudaDeviceSynchronize(); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaFree(buffStor.buff1.buff); + cudaFree(buffStor.buff2.buff); + return 0; +} diff --git a/test/kernel_analysis/06_cuda_labmda.c b/test/kernel_analysis/06_cuda_labmda.c new file mode 100644 index 0000000..1ec55fa --- /dev/null +++ b/test/kernel_analysis/06_cuda_labmda.c @@ -0,0 +1,59 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: ReadWrite +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 0, ], ptr: 1, rw: Write +// CHECK-NOT: Handling Arg: + +#include +#include + +template +__global__ void kernel_functor(F functor) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + functor(tid); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* d_data; // Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + cudaDeviceSynchronize(); + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + const auto lamba_kernel = [=] __host__ __device__(const int tid) { d_data[tid] = (tid + 1); }; + kernel_functor<<>>(lamba_kernel); + cudaEventRecord(endEvent); + +#ifdef CUSAN_SYNC + // Wait for the end event to complete (alternative to querying) + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/kernel_analysis/07_negative_array.c b/test/kernel_analysis/07_negative_array.c new file mode 100644 index 0000000..17017ae --- /dev/null +++ b/test/kernel_analysis/07_negative_array.c @@ -0,0 +1,57 @@ +// clang-format off +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s + +// clang-format on + +// 
CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}} ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[], ptr: 1, rw: Write +// CHECK-NOT: Handling Arg: + +#include +#include + +__global__ void kernel(int** data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + data[-1][tid] = (tid + 1); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int** d_data; // Unified Memory pointer + cudaMallocManaged(&d_data, 2 * sizeof(int*)); + + // Allocate Unified Memory + cudaMallocManaged(&d_data[0], size * sizeof(int)); + cudaMallocManaged(&d_data[1], size * sizeof(int)); + cudaMemset(d_data[0], 0, size * sizeof(int)); + cudaMemset(d_data[1], 0, size * sizeof(int)); + + kernel<<>>(&d_data[1]); + +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[0][i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaFree(d_data); + + return 0; +} \ No newline at end of file diff --git a/test/kernel_analysis/08_big_struct_write.c b/test/kernel_analysis/08_big_struct_write.c new file mode 100644 index 0000000..ccf6dbc --- /dev/null +++ b/test/kernel_analysis/08_big_struct_write.c @@ -0,0 +1,104 @@ +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | \ +// RUN: %filecheck %s + +// CHECK-NOT: Handling Arg: +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 0, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 1, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite + +// CHECK: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}gep_indices:[], ptr: 1, rw: Read +// CHECK-NEXT: subarg: {{.*}}, is_loading, gep_indices:[0, 1, ], ptr: 1, rw: Write +// CHECK-NEXT: Handling Arg: +// CHECK-NEXT: subarg: {{.*}}ptr: 0, rw: ReadWrite +// CHECK-NOT: Handling Arg: + +#include "../support/gpu_mpi.h" + +struct BufferStorage { + int* buff1; + int* buff2; + int a; + int b; + float c; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. 
Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1, size * sizeof(int)); + cudaMalloc(&buffStor.buff2, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + if (world_rank == 0) { + kernel1<<>>(buffStor, size); + kernel3<<>>( + buffStor, size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, + size); // also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + // MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + MPI_Send(buffStor.buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + // MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(buffStor.buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + kernel3<<>>( + buffStor, size); // problem since different stream but same write target + } + + cudaFree(buffStor.buff1); + cudaFree(buffStor.buff2); + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + MPI_Finalize(); + return 0; +} diff --git a/test/lit.cfg b/test/lit.cfg index e624c6c..198aa41 100644 --- a/test/lit.cfg +++ b/test/lit.cfg @@ -30,7 +30,7 @@ cusan_script_dir = getattr(config, "cusan_script_dir", None) cusan_std_args = "-cusan" cusan_pass = "{}/{}".format(cusan_pass_dir, cusan_pass_name) -to_llvm_args = "-O1 -g -Xclang -disable-llvm-passes -S -emit-llvm -o -" +to_llvm_args = "-O0 -Xclang -disable-O0-optnone -g -c -emit-llvm" clang_cpp = getattr(config, "clang_cpp", "clang++") clang_cc = getattr(config, "clang", "clang") @@ -65,7 +65,6 @@ config.substitutions.append(("%cpp-to-llvm", "{} {}".format(clang_cpp, to_llvm_a if cusan_script_dir is not None: config.substitutions.append(("%script_dir", cusan_script_dir)) - config.substitutions.append(("%run", "{}/run.sh".format(cusan_script_dir))) config.substitutions.append(("%apply", "{}/apply.sh".format(cusan_script_dir))) config.substitutions.append(("%cusan_test_dir", config.test_exec_root)) @@ -78,4 +77,5 @@ config.substitutions.append(('%mpi-exec', '{}'.format(config.mpiexec))) config.substitutions.append(('%tsan-compile-flags', '-fsanitize=thread')) config.substitutions.append(('%tsan-options', 'TSAN_OPTIONS="exitcode=0 suppressions=%S/suppressions.txt"')) -config.substitutions.append(('%cusan_ldpreload', 'LD_PRELOAD="{}"'.format(config.cusan_mpi_interceptor))) \ No newline at end of file +config.substitutions.append(('%cusan_ldpreload', 'LD_PRELOAD="{}"'.format(config.cusan_mpi_interceptor))) +config.substitutions.append(('%clang_args', '-D__STRICT_ANSI__ -fPIC -O1 -g0')) \ No newline at end of file diff --git a/test/pass/01_test.c b/test/pass/01_test.c index 6f24a02..218b740 100644 --- a/test/pass/01_test.c +++ b/test/pass/01_test.c @@ -1,14 +1,18 @@ -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s - -// CHECK: TypeArtPass [Heap] -// CHECK-NEXT: Malloc : 2 - -// CHECK: cudaMemcpy -// CHECK: _cusan_memcpy -// CHECK: cudaDeviceSynchronize -// CHECK: _cusan_sync_device -// CHECK: cudaMemcpy -// 
CHECK: _cusan_memcpy +// clang-format off +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s +// clang-format on + +// CHECK: {{(invoke|call)}} i32 @cudaMalloc +// CHECK: {{(invoke|call)}} void @_cusan_device_alloc +// CHECK: {{(invoke|call)}} i32 @cudaMalloc +// CHECK: {{(invoke|call)}} void @_cusan_device_alloc +// CHECK: {{(invoke|call)}} i32 @cudaMemcpy +// CHECK: {{(invoke|call)}} void @_cusan_memcpy +// CHECK: {{(invoke|call)}} i32 @cudaDeviceSynchronize +// CHECK: {{(invoke|call)}} void @_cusan_sync_device +// CHECK: {{(invoke|call)}} i32 @cudaMemcpy +// CHECK: {{(invoke|call)}} void @_cusan_memcpy +// CHECK: {{(invoke|call)}} i32 @cudaDeviceReset #include __device__ void axpy_write(float a, float* y) { diff --git a/test/pass/02_event.c b/test/pass/02_event.c index 4c02cb7..ef8e7a5 100644 --- a/test/pass/02_event.c +++ b/test/pass/02_event.c @@ -1,24 +1,12 @@ // clang-format off -// RUN: %wrapper-cc %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cc %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaEventCreate +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event -// CHECK-LLVM-IR: invoke i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record #include diff --git a/test/pass/03_cuda_to_mpi.c b/test/pass/03_cuda_to_mpi.c index 60c08ef..9b16f0b 100644 --- a/test/pass/03_cuda_to_mpi.c +++ b/test/pass/03_cuda_to_mpi.c @@ -1,22 +1,11 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: 
[Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[target:%[0-9a-z]+]], i8* {{.*}}[[from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[target]], i8* {{.*}}[[from]], +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], #include "../support/gpu_mpi.h" diff --git a/test/pass/04_mpi_to_cuda.c b/test/pass/04_mpi_to_cuda.c index 375f90c..f5c905d 100644 --- a/test/pass/04_mpi_to_cuda.c +++ b/test/pass/04_mpi_to_cuda.c @@ -1,27 +1,14 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[target:%[0-9a-z]+]], i8* {{.*}}[[from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[target]], i8* {{.*}}[[from]], - -// FLAKYPASS: * -// ALLOW_RETRIES: 5 +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], #include "../support/gpu_mpi.h" diff --git a/test/pass/05_cuda_to_mpi_stream.c b/test/pass/05_cuda_to_mpi_stream.c index fd41911..99e40af 100644 --- a/test/pass/05_cuda_to_mpi_stream.c +++ b/test/pass/05_cuda_to_mpi_stream.c @@ -1,25 +1,13 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s 
--allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream // CHECK-LLVM-IR: cudaMemcpyAsync // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async -// CHECK-LLVM-IR: invoke i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream #include "../support/gpu_mpi.h" @@ -28,7 +16,7 @@ __global__ void kernel(int* arr, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { #if __CUDA_ARCH__ >= 700 - for (int i = 0; i < tid; i++) { + for (int i = 0; i < tid; i++) { __nanosleep(1000000U); } #else diff --git a/test/pass/06_cuda_to_mpi_event.c b/test/pass/06_cuda_to_mpi_event.c index 954bbdb..d932fa7 100644 --- a/test/pass/06_cuda_to_mpi_event.c +++ b/test/pass/06_cuda_to_mpi_event.c @@ -1,26 +1,15 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaEventCreate +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event -// CHECK-LLVM-IR: invoke i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[target:%[0-9a-z]+]], i8* {{.*}}[[from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[target]], i8* {{.*}}[[from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], #include "../support/gpu_mpi.h" diff --git a/test/pass/07_cuda_to_mpi_read.c b/test/pass/07_cuda_to_mpi_read.c index 302ca8b..1bf3b4a 100644 --- a/test/pass/07_cuda_to_mpi_read.c +++ b/test/pass/07_cuda_to_mpi_read.c @@ -1,20 +1,15 @@ // clang-format off -// RUN: %wrapper-mpicxx -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda
--cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-NOT: data race -// CHECK-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device -// CHECK-LLVM-IR: invoke i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[target:%[0-9a-z]+]], i8* {{.*}}[[from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[target]], i8* {{.*}}[[from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], #include "../support/gpu_mpi.h" diff --git a/test/pass/08_cudamemcpy_to_mpi.c b/test/pass/08_cudamemcpy_to_mpi.c index 410b44e..ed3b4fa 100644 --- a/test/pass/08_cudamemcpy_to_mpi.c +++ b/test/pass/08_cudamemcpy_to_mpi.c @@ -1,23 +1,19 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR -// CHECK-NOT: data race -// CHECK-NOT: [Error] sync -// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream -// CHECK-LLVM-IR: invoke i32 @cudaMemset(i8* {{.*}}[[mset_target:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset(i8* {{.*}}[[mset_target]], -// CHECK-LLVM-IR: invoke i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset({{i8\*|ptr}} {{.*}}[[mset_target:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset({{i8\*|ptr}} {{.*}}[[mset_target]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[mcpy_target:%[0-9a-z]+]], i8* {{.*}}[[mcpy_from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[mcpy_target]], i8* {{.*}}[[mcpy_from]], -// CHECK-LLVM-IR: invoke i32 @cudaMemcpyAsync(i8* {{.*}}[[mcpyasy_target:%[0-9a-z]+]], i8* {{.*}}[[mcpyasy_from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async(i8* {{.*}}[[mcpyasy_target]], i8* {{.*}}[[mcpyasy_from]], -// CHECK-LLVM-IR: invoke i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[mcpy_target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[mcpy_from:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[mcpy_target]], {{i8\*|ptr}} 
{{.*}}[[mcpy_from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpyAsync({{i8\*|ptr}} {{.*}}[[mcpyasy_target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[mcpyasy_from:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async({{i8\*|ptr}} {{.*}}[[mcpyasy_target]], {{i8\*|ptr}} {{.*}}[[mcpyasy_from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream // clang-format on diff --git a/test/pass/09_cudamemcpy_default.c b/test/pass/09_cudamemcpy_default.c index 24d4f2f..ea3796d 100644 --- a/test/pass/09_cudamemcpy_default.c +++ b/test/pass/09_cudamemcpy_default.c @@ -1,20 +1,15 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-NOT: data race -// CHECK-NOT: [Error] sync - -// CHECK-LLVM-IR: @main(i32 noundef %0, i8** noundef %1) -// CHECK-LLVM-IR: invoke i32 @cudaHostRegister(i8* {{.*}}[[unregister_ptr:%[0-9a-z]+]] -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_host_register(i8* {{.*}}[[unregister_ptr]] -// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[target:%[0-9a-z]+]], i8* {{.*}}[[from:%[0-9a-z]+]], -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[target]], i8* {{.*}}[[from]], -// CHECK-LLVM-IR: invoke i32 @cudaHostUnregister(i8* {{.*}}[[unregister_ptr:%[0-9a-z]+]] -// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_host_unregister(i8* {{.*}}[[unregister_ptr]] +// CHECK-LLVM-IR: @main(i32 noundef %0, {{i8\*\*|ptr}} noundef %1) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaHostRegister({{i8\*|ptr}} {{.*}}[[unregister_ptr:%[0-9a-z]+]] +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_host_register({{i8\*|ptr}} {{.*}}[[unregister_ptr]] +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaHostUnregister({{i8\*|ptr}} +// {{.*}}[[unregister_ptr:%[0-9a-z]+]] CHECK-LLVM-IR: {{call|invoke}} void @_cusan_host_unregister({{i8\*|ptr}} +// {{.*}}[[unregister_ptr]] #include #include diff --git a/test/pass/10_cudahostalloc.c b/test/pass/10_cudahostalloc.c index dc9ab40..269fe60 100644 --- a/test/pass/10_cudahostalloc.c +++ b/test/pass/10_cudahostalloc.c @@ -1,25 +1,20 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-NOT: data race -// 
CHECK-NOT: [Error] sync - -// CHECK-LLVM-IR: @main(i32 noundef %0, i8** noundef %1) -// CHECK-LLVM-IR: invoke i32 @cudaMallocHost -// CHECK-LLVM-IR: call void @_cusan_host_alloc -// CHECK-LLVM-IR: invoke noundef i32 @_ZL13cudaHostAllocIiE9cudaErrorPPT_mj -// CHECK-LLVM-IR: invoke i32 @cudaFreeHost({{.*}}[[free_ptr1:%[0-9a-z]+]]) -// CHECK-LLVM-IR: call void @_cusan_host_free({{.*}}[[free_ptr1]]) -// CHECK-LLVM-IR: invoke i32 @cudaFreeHost({{.*}}[[free_ptr2:%[0-9a-z]+]]) -// CHECK-LLVM-IR: call void @_cusan_host_free({{.*}}[[free_ptr2]]) +// CHECK-LLVM-IR: @main(i32 noundef %0, {{i8\*\*|ptr}} noundef %1) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMallocHost +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_host_alloc +// CHECK-LLVM-IR: {{(call|invoke)}} noundef i32 @_ZL13cudaHostAllocIiE9cudaErrorPPT_mj +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFreeHost({{.*}}[[free_ptr1:%[0-9a-z]+]]) +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_host_free({{.*}}[[free_ptr1]]) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFreeHost({{.*}}[[free_ptr2:%[0-9a-z]+]]) +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_host_free({{.*}}[[free_ptr2]]) // CHECK-LLVM-IR: _ZL13cudaHostAllocIiE9cudaErrorPPT_mj -// CHECK-LLVM-IR: invoke i32 @cudaHostAlloc({{.*}}[[host_alloc_ptr:%[0-9a-z]+]]) -// CHECK-LLVM-IR: call void @_cusan_host_alloc({{.*}}[[host_alloc_ptr]]) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaHostAlloc({{.*}}[[host_alloc_ptr:%[0-9a-z]+]]) +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_host_alloc({{.*}}[[host_alloc_ptr]]) #include #include diff --git a/test/pass/11_cuda_to_mpi_struct_of_buff.c b/test/pass/11_cuda_to_mpi_struct_of_buff.c index c84cf6d..1da1115 100644 --- a/test/pass/11_cuda_to_mpi_struct_of_buff.c +++ b/test/pass/11_cuda_to_mpi_struct_of_buff.c @@ -1,45 +1,45 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on -// UN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll - -// CHECK-DAG: data race - -// CHECK-SYNC-NOT: data race +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy #include "../support/gpu_mpi.h" -struct BufferStorage{ +struct BufferStorage { int* buff1; int* buff2; }; - __global__ void kernel1(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - 
storage.buff1[tid] = tid*32; + storage.buff1[tid] = tid * 32; } } __global__ void kernel2(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2[tid] = tid*32; + storage.buff2[tid] = tid * 32; } } __global__ void kernel3(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2[tid] = tid*32; + storage.buff2[tid] = tid * 32; } } - - int main(int argc, char* argv[]) { if (!has_gpu_aware_mpi()) { printf("This example is designed for CUDA-aware MPI. Exiting.\n"); @@ -67,19 +67,22 @@ int main(int argc, char* argv[]) { cudaStream_t stream1, stream2; cudaStreamCreate(&stream1); cudaStreamCreate(&stream2); - if(world_rank == 0){ + if (world_rank == 0) { kernel1<<>>(buffStor, size); - kernel3<<>>(buffStor, size);//no problem since kernel 1 and 3 write to different - kernel2<<>>(buffStor, size);//also no problem since they on same stream + kernel3<<>>( + buffStor, size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, + size); // also no problem since they on same stream #ifdef CUSAN_SYNC cudaDeviceSynchronize(); #endif - //MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + // MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); MPI_Send(buffStor.buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); - }else if (world_rank == 1){ - //MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } else if (world_rank == 1) { + // MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); MPI_Recv(buffStor.buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); - kernel3<<>>(buffStor, size);//problem since different stream but same write target + kernel3<<>>( + buffStor, size); // problem since different stream but same write target } cudaFree(buffStor.buff1); diff --git a/test/pass/11_struct_of_buff.c b/test/pass/11_struct_of_buff.c index 311bc63..97a7b7e 100644 --- a/test/pass/11_struct_of_buff.c +++ b/test/pass/11_struct_of_buff.c @@ -1,45 +1,46 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// UN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll - -// CHECK-DAG: data race - -// CHECK-SYNC-NOT: data race +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on + +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// 
CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" -struct BufferStorage{ +struct BufferStorage { int* buff1; int* buff2; }; - __global__ void kernel1(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff1[tid] = tid*32; + storage.buff1[tid] = tid * 32; } } __global__ void kernel2(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2[tid] = tid*32; + storage.buff2[tid] = tid * 32; } } __global__ void kernel3(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2[tid] = tid*32; + storage.buff2[tid] = tid * 32; } } - - int main(int argc, char* argv[]) { const int size = 512; const int threadsPerBlock = size; @@ -54,17 +55,19 @@ int main(int argc, char* argv[]) { cudaStreamCreate(&stream2); kernel1<<>>(buffStor, size); - kernel3<<>>(buffStor, size);//no problem since kernel 1 and 3 write to different - kernel2<<>>(buffStor, size);//also no problem since they on same stream + kernel3<<>>(buffStor, + size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, size); // also no problem since they on same stream #ifdef CUSAN_SYNC cudaDeviceSynchronize(); #endif - kernel3<<>>(buffStor, size);//problem since different stream but same write target + kernel3<<>>( + buffStor, size); // problem since different stream but same write target cudaDeviceSynchronize(); - cudaStreamDestroy ( stream2 ); - cudaStreamDestroy ( stream1 ); + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); cudaFree(buffStor.buff1); cudaFree(buffStor.buff2); return 0; diff --git a/test/pass/12_struct_ptr.c b/test/pass/12_struct_ptr.c index 6f887a2..0c13873 100644 --- a/test/pass/12_struct_ptr.c +++ b/test/pass/12_struct_ptr.c @@ -1,33 +1,24 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll -// CHECK-DAG: data race - -// CHECK-SYNC-NOT: data race - - -// CHECK-LLVM-IR: @main(i32 noundef %0, i8** noundef %1) -// CHECK-LLVM-IR: invoke i32 @cudaMalloc -// CHECK-LLVM-IR: call void @_cusan_device_alloc -// CHECK-LLVM-IR: invoke i32 @cudaMalloc -// CHECK-LLVM-IR: call void @_cusan_device_alloc -// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate +// CHECK-LLVM-IR: @main(i32 noundef %0, {{i8\*\*|ptr}} noundef %1) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMalloc +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_alloc +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMalloc +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_alloc +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream -// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate +// 
CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream -// CHECK-LLVM-IR: invoke i32 @cudaStreamDestroy -// CHECK-LLVM-IR: invoke i32 @cudaStreamDestroy -// CHECK-LLVM-IR: invoke i32 @cudaFree({{.*}}[[free_ptr1:%[0-9a-z]+]]) -// CHECK-LLVM-IR: call void @_cusan_device_free({{.*}}[[free_ptr1]]) -// CHECK-LLVM-IR: invoke i32 @cudaFree({{.*}}[[free_ptr2:%[0-9a-z]+]]) -// CHECK-LLVM-IR: call void @_cusan_device_free({{.*}}[[free_ptr2]]) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree({{.*}}[[free_ptr1:%[0-9a-z]+]]) +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free({{.*}}[[free_ptr1]]) +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree({{.*}}[[free_ptr2:%[0-9a-z]+]]) +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free({{.*}}[[free_ptr2]]) #include "../support/gpu_mpi.h" diff --git a/test/pass/13_struct_recursion.c b/test/pass/13_struct_recursion.c index 735ade6..08d85cb 100644 --- a/test/pass/13_struct_recursion.c +++ b/test/pass/13_struct_recursion.c @@ -1,52 +1,54 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on -// UN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll - - -// CHECK-DAG: data race - -// CHECK-SYNC-NOT: data race +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" -struct BufferStorage2{ +struct BufferStorage2 { int* buff; }; -struct BufferStorage{ +struct BufferStorage { BufferStorage2 buff1; BufferStorage2 buff2; }; - __global__ void kernel1(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff1.buff[tid] = tid*32; + storage.buff1.buff[tid] = tid * 32; } } __global__ void kernel2(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2.buff[tid] = tid*32; + storage.buff2.buff[tid] = tid * 32; } } __global__ void kernel3(BufferStorage storage, const int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; if (tid < N) { - storage.buff2.buff[tid] = tid*32; + storage.buff2.buff[tid] = tid * 32; } } 
- int main(int argc, char* argv[]) { const int size = 512; const int threadsPerBlock = size; @@ -70,8 +72,8 @@ int main(int argc, char* argv[]) { cudaDeviceSynchronize(); - cudaStreamDestroy ( stream2 ); - cudaStreamDestroy ( stream1 ); + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); cudaFree(buffStor.buff1.buff); cudaFree(buffStor.buff2.buff); return 0; diff --git a/test/pass/14_cuda_functor.c b/test/pass/14_cuda_functor.c index f26b431..cf1858d 100644 --- a/test/pass/14_cuda_functor.c +++ b/test/pass/14_cuda_functor.c @@ -1,24 +1,19 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync - -// CHECK-LLVM-IR: invoke i32 @cudaEventCreate +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event -// CHECK-LLVM-IR: invoke i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord // CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include #include diff --git a/test/pass/15_cuda_memset_sync.c b/test/pass/15_cuda_memset_sync.c index fee85de..a5ed35d 100644 --- a/test/pass/15_cuda_memset_sync.c +++ b/test/pass/15_cuda_memset_sync.c @@ -1,17 +1,22 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void 
@_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free #include #include diff --git a/test/pass/15_cuda_memset_sync_nonblocking.c b/test/pass/15_cuda_memset_sync_nonblocking.c index 22571bd..40b4bb5 100644 --- a/test/pass/15_cuda_memset_sync_nonblocking.c +++ b/test/pass/15_cuda_memset_sync_nonblocking.c @@ -1,17 +1,23 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreateWithFlags +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free #include #include diff --git a/test/pass/17_cuda_stream_query_busy_loop.c b/test/pass/17_cuda_stream_query_busy_loop.c index 25f04ca..90043e4 100644 --- a/test/pass/17_cuda_stream_query_busy_loop.c +++ b/test/pass/17_cuda_stream_query_busy_loop.c @@ -1,19 +1,16 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// UN: %apply %s -DCUSAN_SYNC --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll +// RUN: %apply %s -strip-debug 
--cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy #include #include diff --git a/test/pass/18_cuda_event_query_busy_loop.c b/test/pass/18_cuda_event_query_busy_loop.c index 286d201..f24cf7f 100644 --- a/test/pass/18_cuda_event_query_busy_loop.c +++ b/test/pass/18_cuda_event_query_busy_loop.c @@ -1,19 +1,20 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// UN: %apply %s -DCUSAN_SYNC --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_event +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_event_record +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy #include #include diff --git a/test/pass/18_cuda_event_with_flags_query_busy_loop.c b/test/pass/18_cuda_event_with_flags_query_busy_loop.c new file mode 100644 index 0000000..eec2651 --- /dev/null +++ b/test/pass/18_cuda_event_with_flags_query_busy_loop.c @@ -0,0 +1,67 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR + +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreateWithFlags +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_event +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_event_record +// CHECK-LLVM-IR: 
{{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + cudaStream_t stream1; + cudaStreamCreate(&stream1); + cudaEvent_t event1; + cudaEventCreateWithFlags(&event1, cudaEventBlockingSync); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); + cudaEventRecord(event1, stream1); + +#ifdef CUSAN_SYNC + while (cudaEventQuery(event1) != cudaSuccess) { + } +#endif + + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + // break; + } + } + + cudaFree(managed_data); + cudaStreamDestroy(stream1); + return 0; +} diff --git a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c index 3caa2bf..76ddf1d 100644 --- a/test/pass/18_cuda_to_mpi_event_query_busy_loop.c +++ b/test/pass/18_cuda_to_mpi_event_query_busy_loop.c @@ -1,19 +1,21 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC - -// UN: %apply %s -DCUSAN_SYNC --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_event +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_event_record +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventDestroy #include "../support/gpu_mpi.h" diff --git a/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c index 8596a24..0088b7b 100644 --- a/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c +++ 
b/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c @@ -1,17 +1,24 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s - -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpyAsync +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy_async +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy #include #include diff --git a/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c b/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c index 004395c..f7edfbd 100644 --- a/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c +++ b/test/pass/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c @@ -1,17 +1,27 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreateWithFlags +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreateWithFlags +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: 
{{(call|invoke)}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpyAsync +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy_async +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy #include #include diff --git a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c index ad5592c..9b29b17 100644 --- a/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c +++ b/test/pass/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -1,17 +1,22 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpyAsync +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy_async +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy #include "../support/gpu_mpi.h" diff --git a/test/pass/20_cuda_default_stream_sync.c b/test/pass/20_cuda_default_stream_sync.c index 4e4cc4d..d7dbbf5 100644 --- a/test/pass/20_cuda_default_stream_sync.c +++ b/test/pass/20_cuda_default_stream_sync.c @@ -1,16 +1,20 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda 
--cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" diff --git a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c index f3b02fc..1aaf301 100644 --- a/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c +++ b/test/pass/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -1,16 +1,18 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free // FLAKYPASS: * // ALLOW_RETRIES: 5 diff --git a/test/pass/21_chunked_streams_example.c b/test/pass/21_chunked_streams_example.c index 574d370..92993cb 100644 --- a/test/pass/21_chunked_streams_example.c +++ b/test/pass/21_chunked_streams_example.c @@ -1,16 +1,24 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// UN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// UN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug 
--cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-NOT: data race -// CHECK-NOT: [Error] sync - -// HECK-SYNC-NOT: data race -// HECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpyAsync +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpy +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy #include #include diff --git a/test/pass/22_cuda_to_mpi_partial_buff_write.c b/test/pass/22_cuda_to_mpi_partial_buff_write.c index c1cf424..b210fc2 100644 --- a/test/pass/22_cuda_to_mpi_partial_buff_write.c +++ b/test/pass/22_cuda_to_mpi_partial_buff_write.c @@ -1,10 +1,21 @@ // clang-format off -// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-NOT: data race -// CHECK-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemcpyAsync +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" @@ -31,7 +42,7 @@ int main(int argc, char* argv[]) { const int size = 512; const int threadsPerBlock = size; const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; - static_assert(size % 2 == 0, "Needs to be divisble by 2"); + static_assert(size % 2 == 0, "Needs to be divisible by 2"); const int half_size = size / 2; MPI_Init(&argc, &argv); diff --git a/test/pass/23_cuda_default_stream_post_sync.c b/test/pass/23_cuda_default_stream_post_sync.c index bf654a3..4bf660d 100644 --- a/test/pass/23_cuda_default_stream_post_sync.c +++ b/test/pass/23_cuda_default_stream_post_sync.c @@ -1,16 +1,19 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode 
arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" diff --git a/test/pass/23_cuda_default_stream_post_sync_nonblocking.c b/test/pass/23_cuda_default_stream_post_sync_nonblocking.c index 2a5b817..336de57 100644 --- a/test/pass/23_cuda_default_stream_post_sync_nonblocking.c +++ b/test/pass/23_cuda_default_stream_post_sync_nonblocking.c @@ -1,16 +1,21 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreateWithFlags +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" diff --git a/test/pass/24_cuda_sync_stream_default_nonblocking.c b/test/pass/24_cuda_sync_stream_default_nonblocking.c index 17247fd..ee107f0 100644 --- a/test/pass/24_cuda_sync_stream_default_nonblocking.c 
+++ b/test/pass/24_cuda_sync_stream_default_nonblocking.c @@ -1,16 +1,17 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreateWithFlags +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" diff --git a/test/pass/25_cuda_default_stream_double_sync.c b/test/pass/25_cuda_default_stream_double_sync.c index e8a5132..5d0e352 100644 --- a/test/pass/25_cuda_default_stream_double_sync.c +++ b/test/pass/25_cuda_default_stream_double_sync.c @@ -1,16 +1,20 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe -// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR // clang-format on -// CHECK-DAG: data race -// CHECK-DAG: [Error] sync - -// CHECK-SYNC-NOT: data race -// CHECK-SYNC-NOT: [Error] sync +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaMemset +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memset +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_device +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free +// CHECK-LLVM-IR: {{call|invoke}} i32 @cudaFree +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_device_free #include "../support/gpu_mpi.h" diff --git a/test/pass/26_malloc_pitch.c b/test/pass/26_malloc_pitch.c new file mode 100644 index 0000000..1eb7b92 --- /dev/null +++ b/test/pass/26_malloc_pitch.c @@ -0,0 +1,86 @@ +// clang-format off +// RUN: %apply %s -strip-debug 
--cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[target:%[0-9a-z]+]], {{i8\*|ptr}} +// {{.*}}[[from:%[0-9a-z]+]], CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[target]], +// {{i8\*|ptr}} {{.*}}[[from]], + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int width = 4; + const int height = 8; + + int* d_data; + size_t pitch; + cudaMallocPitch(&d_data, &pitch, width * sizeof(int), height); + + size_t true_buffer_size = pitch * height; + size_t true_n_elements = true_buffer_size / sizeof(int); + // printf("%zu %zu %zu\n", true_buffer_size, true_n_elements, pitch); + assert(true_buffer_size % sizeof(int) == 0); + + const int threadsPerBlock = true_n_elements; + const int blocksPerGrid = (true_n_elements + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + if (world_rank == 0) { + kernel<<>>(d_data, true_n_elements); +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); // FIXME: uncomment for correct execution +#endif + MPI_Send(d_data, true_n_elements, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, true_n_elements, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(true_buffer_size); + cudaMemcpy(h_data, d_data, true_buffer_size, cudaMemcpyDeviceToHost); + for (int i = 0; i < true_n_elements; i++) { + const int buf_v = h_data[i]; + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/pass/28_cuda_memset2d_implicit_syn.c b/test/pass/28_cuda_memset2d_implicit_syn.c new file mode 100644 index 0000000..de6259a --- /dev/null +++ b/test/pass/28_cuda_memset2d_implicit_syn.c @@ -0,0 +1,103 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_kernel_register +// CHECK-LLVM-IR: {{(call|invoke)}} noundef i32 @cudaLaunchKernel +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreateWithPriority +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemset2D +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memset_2d +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaDeviceSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void 
@_cusan_sync_device +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy2DAsync +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy_2d_async +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_sync_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMallocPitch +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_alloc + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(99000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + const int width = 64; + const int height = 8; + + int* d_data; + size_t pitch; + // allocations + cudaMallocPitch(&d_data, &pitch, width * sizeof(int), height); + + int* dummy_d_data; + size_t dummy_pitch; + cudaMallocPitch(&dummy_d_data, &dummy_pitch, width * sizeof(int), height); + int* h_data = (int*)malloc(width * sizeof(int) * height); + + size_t true_buffer_size = pitch * height; + size_t true_n_elements = true_buffer_size / sizeof(int); + assert(true_buffer_size % sizeof(int) == 0); + const int threadsPerBlock = true_n_elements; + const int blocksPerGrid = (true_n_elements + threadsPerBlock - 1) / threadsPerBlock; + + cudaStream_t stream1; + cudaStreamCreate(&stream1); + cudaStream_t stream2; + cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, -1); + + // null out all the data + cudaMemset2D(d_data, pitch, 0, width, height); + memset(h_data, 0, width * sizeof(int) * height); + cudaDeviceSynchronize(); + + kernel<<>>(d_data, true_n_elements); + +#ifdef CUSAN_SYNC + // copy into dummy data buffer causing implicit sync + cudaMemset2D(dummy_d_data, dummy_pitch, 0, width, height); +#endif + + // do async non blocking copy which will fail if there was no sync between this and the writing kernel + cudaMemcpy2DAsync(h_data, width * sizeof(int), d_data, pitch, width * sizeof(int), height, cudaMemcpyDeviceToHost, + stream2); + cudaStreamSynchronize(stream2); + for (int i = 0; i < width * height; i++) { + const int buf_v = h_data[i]; + // printf("buf[%d] = %d\n", i, buf_v); + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + } + + free(h_data); + cudaFree(d_data); + cudaFree(dummy_d_data); + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + return 0; +} diff --git a/test/pass/29_tsan_cuda_to_mpi.c b/test/pass/29_tsan_cuda_to_mpi.c new file mode 100644 index 0000000..6be0bae --- /dev/null +++ b/test/pass/29_tsan_cuda_to_mpi.c @@ -0,0 +1,81 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR + +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, 
const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + kernel<<>>(d_data, size); +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); // FIXME: uncomment for correct execution +#endif + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/pass/30_tsan_annotate_cuda_to_mpi.c b/test/pass/30_tsan_annotate_cuda_to_mpi.c new file mode 100644 index 0000000..d3f18d4 --- /dev/null +++ b/test/pass/30_tsan_annotate_cuda_to_mpi.c @@ -0,0 +1,83 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_memcpy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaFree +// CHECK-LLVM-IR: {{(call|invoke)}} void @_cusan_device_free + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + kernel<<>>(d_data, size); + +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); // FIXME: uncomment for correct execution +#endif + + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); // CHECK-DAG: [[FILENAME]]:[[@LINE]] + + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/pass/31_tsan_cuda_event.c b/test/pass/31_tsan_cuda_event.c new file mode 100644 index 0000000..c0a31c6 --- /dev/null +++ b/test/pass/31_tsan_cuda_event.c @@ -0,0 +1,60 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR + +// clang-format on + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, const int N) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + arr[tid] = arr[tid] + 1; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. 
Exiting.\n"); + return 1; + } + cudaEvent_t first_finished_event; + cudaEventCreate(&first_finished_event); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + kernel<<>>(d_data, size); + cudaEventRecord(first_finished_event, stream1); + +#ifdef CUSAN_SYNC + cudaEventSynchronize(first_finished_event); +#endif + + kernel<<>>(d_data, size); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(first_finished_event); + cudaFree(d_data); + return 0; +} diff --git a/test/pass/32_tsan_async_copy.c b/test/pass/32_tsan_async_copy.c new file mode 100644 index 0000000..a044405 --- /dev/null +++ b/test/pass/32_tsan_async_copy.c @@ -0,0 +1,77 @@ +// clang-format off + +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR + + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpyAsync({{i8\*|ptr}} {{.*}}[[mcpyasy_target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[mcpyasy_from:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async({{i8\*|ptr}} {{.*}}[[mcpyasy_target]], {{i8\*|ptr}} {{.*}}[[mcpyasy_from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamDestroy + +// clang-format on + +#include "../support/gpu_mpi.h" + +#include + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = tid + 1; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. 
Exiting.\n"); + return 1; + } + + cudaEvent_t first_finished_event; + cudaEventCreate(&first_finished_event); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* h_data = (int*)malloc(size * sizeof(int)); + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + kernel<<>>(d_data, size); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(stream1); +#endif + cudaMemcpyAsync(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost, stream2); + cudaStreamSynchronize(stream2); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + } + free(h_data); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(first_finished_event); + cudaFree(d_data); + return 0; +} diff --git a/test/pass/33_tsan_wait_event.c b/test/pass/33_tsan_wait_event.c new file mode 100644 index 0000000..34d3b02 --- /dev/null +++ b/test/pass/33_tsan_wait_event.c @@ -0,0 +1,101 @@ +// clang-format off +// RUN: %apply %s -strip-debug --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR + + +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[mcpy_target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[mcpy_from:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[mcpy_target]], {{i8\*|ptr}} {{.*}}[[mcpy_from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventCreate +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaEventRecord +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaMemcpy({{i8\*|ptr}} {{.*}}[[mcpy2_target:%[0-9a-z]+]], {{i8\*|ptr}} {{.*}}[[mcpy2_from:%[0-9a-z]+]], +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy({{i8\*|ptr}} {{.*}}[[mcpy2_target]], {{i8\*|ptr}} {{.*}}[[mcpy2_from]], +// CHECK-LLVM-IR: {{(call|invoke)}} i32 @cudaStreamSynchronize +// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream + +// clang-format on + +#include "../support/gpu_mpi.h" + +#include + +__global__ void writing_kernel(float* arr, const int N, float value) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (float)tid + value; + } +} + +__global__ void reading_kernel(float* res, const float* read, const int N, + float value) { // CHECK-DAG: [[FILENAME]]:[[@LINE]] + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + res[tid] = read[tid] + value; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. 
Exiting.\n"); + return 1; + } + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + float* h_data = (float*)malloc(size * sizeof(float)); + memset(h_data, 0, size * sizeof(float)); + // Allocate device memory + float* d_data; + float* res_data; + cudaMalloc(&res_data, size * sizeof(float)); + cudaMalloc(&d_data, size * sizeof(float)); + + // Copy host memory to device + cudaMemcpy(d_data, h_data, size * sizeof(float), cudaMemcpyHostToDevice); + + cudaDeviceSynchronize(); + // Create CUDA streams + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + // Create an event + cudaEvent_t event; + cudaEventCreate(&event); + // Launch first kernel in stream1 + writing_kernel<<>>(d_data, size, 5.0f); + + // Record event after kernel in stream1 + cudaEventRecord(event, stream1); + // Make stream2 wait for the event +#ifdef CUSAN_SYNC + cudaStreamWaitEvent(stream2, event, 0); +#endif + + // Launch second kernel in stream2 + reading_kernel<<>>(res_data, d_data, size, 10.0f); + + // Copy data back to host + cudaMemcpy(h_data, d_data, size * sizeof(float), cudaMemcpyDeviceToHost); + + // Wait for stream2 to finish + cudaStreamSynchronize(stream2); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaEventDestroy(event); + cudaFree(d_data); + free(h_data); + return 0; +} diff --git a/test/pass/TSan_External.h b/test/pass/TSan_External.h index 36849c9..574f0db 100644 --- a/test/pass/TSan_External.h +++ b/test/pass/TSan_External.h @@ -25,16 +25,16 @@ typedef unsigned long long a64; #ifdef MUST_DEBUG // Print an error message *once* if an annotation function is used that is not overwritten by the // TSan runtime -#define FALLBACK_PRINT(func_name) \ - { \ - static bool once = false; \ - if (!once) { \ - printf( \ - "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \ - "signature\n", \ - func_name); \ - once = true; \ - } \ +#define FALLBACK_PRINT(func_name) \ + { \ + static bool once = false; \ + if (!once) { \ + printf( \ + "[MUST-ERROR] %s fallback called, check your TSan runtime and the call " \ + "signature\n", \ + func_name); \ + once = true; \ + } \ } #else #define FALLBACK_PRINT(func_name) diff --git a/test/runtime/02_event.c b/test/runtime/02_event.c new file mode 100644 index 0000000..dc8e17a --- /dev/null +++ b/test/runtime/02_event.c @@ -0,0 +1,63 @@ +// clang-format off +// RUN: %wrapper-cc %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cc %clang_args -DCUSAN_SYNC %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void kernel(int* data) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + data[tid] = (tid + 1); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* d_data; // 
Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + kernel<<>>(d_data); + cudaEventRecord(endEvent); + +#ifdef CUSAN_SYNC + // Wait for the end event to complete (alternative to querying) + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/runtime/03_cuda_to_mpi.c b/test/runtime/03_cuda_to_mpi.c new file mode 100644 index 0000000..d578e1e --- /dev/null +++ b/test/runtime/03_cuda_to_mpi.c @@ -0,0 +1,83 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + kernel<<>>(d_data, size); +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); // FIXME: uncomment for correct execution +#endif + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/04_mpi_to_cuda.c b/test/runtime/04_mpi_to_cuda.c new file mode 100644 index 0000000..fc0f890 --- /dev/null +++ b/test/runtime/04_mpi_to_cuda.c @@ -0,0 +1,109 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +// FLAKYPASS: * +// ALLOW_RETRIES: 5 + +#include "../support/gpu_mpi.h" + +__global__ void kernel_init(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + arr[tid] = -(tid + 1); + } +} + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(100U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + kernel_init<<>>(d_data, size); + cudaDeviceSynchronize(); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (world_rank == 0) { + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Request request; + // Recv all negative numbers: + MPI_Irecv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, &request); +#ifdef CUSAN_SYNC + MPI_Wait(&request, MPI_STATUS_IGNORE); +#endif + // FIXME: MPI_Wait here to avoid racy d_data access + // Set all numbers to positive value: + kernel<<>>(d_data, size); +#ifndef CUSAN_SYNC + MPI_Wait(&request, MPI_STATUS_IGNORE); +#endif + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + // cudaDeviceSynchronize(); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + // Expect: all values should be positive, given the p_1 kernel sets them to tid. + if (buf_v < 1) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/05_cuda_to_mpi_stream.c b/test/runtime/05_cuda_to_mpi_stream.c new file mode 100644 index 0000000..d3f305a --- /dev/null +++ b/test/runtime/05_cuda_to_mpi_stream.c @@ -0,0 +1,88 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (world_rank == 0) { + kernel<<>>(d_data, size); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(stream); // FIXME: uncomment for correct execution +#endif + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpyAsync(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaStreamDestroy(stream); + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/06_cuda_to_mpi_event.c b/test/runtime/06_cuda_to_mpi_event.c new file mode 100644 index 0000000..676d81b --- /dev/null +++ b/test/runtime/06_cuda_to_mpi_event.c @@ -0,0 +1,87 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + cudaEvent_t event; + cudaEventCreate(&event); + kernel<<>>(d_data, size); + cudaEventRecord(event); +#ifdef CUSAN_SYNC + cudaEventSynchronize(event); // FIXME: uncomment for correct execution +#endif + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + cudaEventDestroy(event); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync\n"); + break; + } + // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank); + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/07_cuda_to_mpi_read.c b/test/runtime/07_cuda_to_mpi_read.c new file mode 100644 index 0000000..8a9f43b --- /dev/null +++ b/test/runtime/07_cuda_to_mpi_read.c @@ -0,0 +1,94 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s + +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +__global__ void kernel_init(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + arr[tid] = -(tid + 1); + } +} + +__global__ void kernel(int* arr, const int N, int* result) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(10000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + result[tid] = arr[tid]; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + if (world_rank == 0) { + kernel_init<<>>(d_data, size); + cudaDeviceSynchronize(); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (world_rank == 0) { + int* d_result; + cudaMalloc(&d_result, size * sizeof(int)); + + // kernel and Send both only read d_data + kernel<<>>(d_data, size, d_result); + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + + cudaFree(d_result); + cudaDeviceSynchronize(); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v >= 0) { + printf("[Error] sync\n"); + break; + } + } + free(h_data); + } + + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/08_cudamemcpy_to_mpi.c b/test/runtime/08_cudamemcpy_to_mpi.c new file mode 100644 index 0000000..021f156 --- /dev/null +++ b/test/runtime/08_cudamemcpy_to_mpi.c @@ -0,0 +1,71 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty + +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +// Tsan sometimes crashes with this test it seems +// FLAKYPASS: * +// ALLOW_RETRIES: 5 + +#include "../support/gpu_mpi.h" + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 1 << 26; // 268mb + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* h_data = (int*)malloc(size * sizeof(int)); + memset(h_data, 0, size * sizeof(int)); + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + cudaStream_t extraStream; + cudaStreamCreate(&extraStream); + + if (world_rank == 0) { + cudaMemset(d_data, 255, size * sizeof(int)); + cudaDeviceSynchronize(); + cudaMemcpy(d_data, h_data, size * sizeof(int), cudaMemcpyHostToDevice); + MPI_Send(d_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + } + + if (world_rank == 1) { + // to make sure it doesn't wait for the previous memcpy on default stream we start in another one + cudaMemcpyAsync(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost, extraStream); + cudaStreamSynchronize(extraStream); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v != 0) { + printf("[Error] sync\n"); + break; + } + } + } + free(h_data); + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/09_cudamemcpy_default.c b/test/runtime/09_cudamemcpy_default.c new file mode 100644 index 0000000..9907f83 --- /dev/null +++ b/test/runtime/09_cudamemcpy_default.c @@ -0,0 +1,33 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +#include +#include + +int main(int argc, char* argv[]) { + const int size = 512; + int* h_data = (int*)malloc(size * sizeof(int)); + cudaHostRegister(h_data, size * sizeof(int), cudaHostRegisterDefault); + int* h_data2; + cudaHostAlloc(&h_data2, size * sizeof(int), cudaHostAllocDefault); + + memset(h_data, 0, size * sizeof(int)); + cudaMemcpy(h_data, h_data, size * sizeof(int), cudaMemcpyDefault); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v != 0) { + printf("[Error] sync\n"); + break; + } + } + cudaHostUnregister(h_data); + cudaFreeHost(h_data2); + + free(h_data); + return 0; +} diff --git a/test/runtime/10_cudahostalloc.c b/test/runtime/10_cudahostalloc.c new file mode 100644 index 0000000..bba75d4 --- /dev/null +++ b/test/runtime/10_cudahostalloc.c @@ -0,0 +1,21 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck --allow-empty %s +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +#include +#include + +int main(int argc, char* argv[]) { + const int size = 512; + int* h_data1; + cudaMallocHost((void**)&h_data1, size * sizeof(int)); + int* h_data2; + cudaHostAlloc(&h_data2, size * sizeof(int), cudaHostAllocDefault); + cudaFreeHost(h_data1); + cudaFreeHost(h_data2); + return 0; +} diff --git a/test/runtime/11_cuda_to_mpi_struct_of_buff.c b/test/runtime/11_cuda_to_mpi_struct_of_buff.c new file mode 100644 index 0000000..31bc0f3 --- /dev/null +++ b/test/runtime/11_cuda_to_mpi_struct_of_buff.c @@ -0,0 +1,91 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode 
arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race + +// CHECK-SYNC-NOT: data race + +#include "../support/gpu_mpi.h" + +struct BufferStorage { + int* buff1; + int* buff2; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1, size * sizeof(int)); + cudaMalloc(&buffStor.buff2, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + if (world_rank == 0) { + kernel1<<>>(buffStor, size); + kernel3<<>>( + buffStor, size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, + size); // also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + // MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + MPI_Send(buffStor.buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + // MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(buffStor.buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + kernel3<<>>( + buffStor, size); // problem since different stream but same write target + } + + cudaFree(buffStor.buff1); + cudaFree(buffStor.buff2); + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/11_struct_of_buff.c b/test/runtime/11_struct_of_buff.c new file mode 100644 index 0000000..11724ff --- /dev/null +++ b/test/runtime/11_struct_of_buff.c @@ -0,0 +1,69 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// clang-format on + +// CHECK-DAG: data race + +// CHECK-SYNC-NOT: data race + +#include "../support/gpu_mpi.h" + +struct BufferStorage { + int* buff1; + int* buff2; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1[tid] = 
tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1, size * sizeof(int)); + cudaMalloc(&buffStor.buff2, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + kernel1<<>>(buffStor, size); + kernel3<<>>(buffStor, + size); // no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, size); // also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + kernel3<<>>( + buffStor, size); // problem since different stream but same write target + + cudaDeviceSynchronize(); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaFree(buffStor.buff1); + cudaFree(buffStor.buff2); + return 0; +} diff --git a/test/runtime/12_struct_ptr.c b/test/runtime/12_struct_ptr.c new file mode 100644 index 0000000..95a365f --- /dev/null +++ b/test/runtime/12_struct_ptr.c @@ -0,0 +1,98 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// CHECK-DAG: data race + +// CHECK-SYNC-NOT: data race + +// XFAIL:* + +#include "../support/gpu_mpi.h" + +struct BufferStorage{ + int* buff1; + int* buff2; +}; + + +__global__ void kernel1(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff1[tid] = tid*32; + } +} +__global__ void kernel2(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff2[tid] = tid*32; + } +} +__global__ void kernel3(BufferStorage* storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage->buff2[tid] = tid*32; + } +} + + + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + + BufferStorage* buffStor; + cudaHostAlloc(&buffStor, sizeof(BufferStorage), 0); + cudaMalloc(&buffStor->buff1, size * sizeof(int)); + cudaMalloc(&buffStor->buff2, size * sizeof(int)); + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + if (world_rank == 0) { + + kernel1<<>>(buffStor, size); + kernel3<<>>(buffStor, size);//no problem since kernel 1 and 3 write to different + kernel2<<>>(buffStor, size);//also no problem since they on same stream +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + MPI_Send(buffStor->buff2, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + //MPI_Send(buffStor.buff1, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + + } else if (world_rank == 1) { + MPI_Recv(buffStor->buff2, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + //MPI_Recv(buffStor.buff1, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + kernel3<<>>(buffStor, size);//problem since different stream but same write target + } + cudaDeviceSynchronize(); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaFree(buffStor->buff1); + cudaFree(buffStor->buff2); + cudaFree(buffStor); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/13_struct_recursion.c b/test/runtime/13_struct_recursion.c new file mode 100644 index 0000000..6705d36 --- /dev/null +++ b/test/runtime/13_struct_recursion.c @@ -0,0 +1,73 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// clang-format on + +// CHECK-DAG: data race + +// CHECK-SYNC-NOT: data race +// XFAIL:* + +#include "../support/gpu_mpi.h" + +struct BufferStorage2 { + int* buff; +}; + +struct BufferStorage { + BufferStorage2 buff1; + BufferStorage2 buff2; +}; + +__global__ void kernel1(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff1.buff[tid] = tid * 32; + } +} +__global__ void kernel2(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2.buff[tid] = tid * 32; + } +} + +__global__ void kernel3(BufferStorage storage, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + storage.buff2.buff[tid] = tid * 32; + } +} + +int main(int argc, char* argv[]) { + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + BufferStorage buffStor; + cudaMalloc(&buffStor.buff1.buff, size * sizeof(int)); + cudaMalloc(&buffStor.buff2.buff, size * sizeof(int)); + + cudaStream_t stream1, stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + kernel1<<>>(buffStor, size); + kernel3<<>>(buffStor, size); + kernel2<<>>(buffStor, size); +#ifdef CUSAN_SYNC + cudaDeviceSynchronize(); +#endif + kernel3<<>>(buffStor, size); + + cudaDeviceSynchronize(); + + cudaStreamDestroy(stream2); + cudaStreamDestroy(stream1); + cudaFree(buffStor.buff1.buff); + cudaFree(buffStor.buff2.buff); + return 0; +} diff --git a/test/runtime/14_cuda_functor.c b/test/runtime/14_cuda_functor.c new file mode 100644 index 
0000000..7bdd380 --- /dev/null +++ b/test/runtime/14_cuda_functor.c @@ -0,0 +1,64 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +template +__global__ void kernel_functor(F functor) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + functor(tid); +} + +int main() { + const int size = 256; + const int threadsPerBlock = 256; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* d_data; // Unified Memory pointer + + // Allocate Unified Memory + cudaMallocManaged(&d_data, size * sizeof(int)); + cudaMemset(d_data, 0, size * sizeof(int)); + cudaDeviceSynchronize(); + cudaEvent_t endEvent; + cudaEventCreate(&endEvent); + const auto lamba_kernel = [=] __host__ __device__(const int tid) { d_data[tid] = (tid + 1); }; + kernel_functor<<>>(lamba_kernel); + cudaEventRecord(endEvent); + +#ifdef CUSAN_SYNC + // Wait for the end event to complete (alternative to querying) + cudaEventSynchronize(endEvent); +#endif + + for (int i = 0; i < size; i++) { + if (d_data[i] < 1) { + printf("[Error] sync\n"); + break; + } + } + + cudaEventDestroy(endEvent); + cudaFree(d_data); + + return 0; +} diff --git a/test/runtime/15_cuda_memset_sync.c b/test/runtime/15_cuda_memset_sync.c new file mode 100644 index 0000000..aa21d8c --- /dev/null +++ b/test/runtime/15_cuda_memset_sync.c @@ -0,0 +1,69 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + int* managed_data2; + int* fake_data; + int* d_data2; + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMallocManaged(&managed_data2, size * sizeof(int)); + cudaMallocManaged(&fake_data, 4); + cudaMemset(managed_data, 0, size * sizeof(int)); + 
cudaMemset(managed_data2, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); +#ifdef CUSAN_SYNC + cudaMemset(fake_data, 0, 4); +#endif + write_kernel_delay<<>>(managed_data2, size, 1); + cudaStreamSynchronize(stream2); + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + break; + } + } + + cudaFree(d_data2); + cudaFree(managed_data); + + return 0; +} diff --git a/test/runtime/15_cuda_memset_sync_nonblocking.c b/test/runtime/15_cuda_memset_sync_nonblocking.c new file mode 100644 index 0000000..6860d1f --- /dev/null +++ b/test/runtime/15_cuda_memset_sync_nonblocking.c @@ -0,0 +1,69 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + int* managed_data2; + int* fake_data; + int* d_data2; + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + cudaStreamCreate(&stream2); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMallocManaged(&managed_data2, size * sizeof(int)); + cudaMallocManaged(&fake_data, 4); + cudaMemset(managed_data, 0, size * sizeof(int)); + cudaMemset(managed_data2, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); + cudaMemset(fake_data, 0, 4); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(stream1); +#endif + write_kernel_delay<<>>(managed_data2, size, 1); + cudaStreamSynchronize(stream2); + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + break; + } + } + + cudaFree(d_data2); + cudaFree(managed_data); + + return 0; +} diff --git a/test/pass/16_cuda_hostalloc_implicit_sync.c b/test/runtime/16_cuda_hostalloc_implicit_sync.c similarity index 87% rename from test/pass/16_cuda_hostalloc_implicit_sync.c rename to test/runtime/16_cuda_hostalloc_implicit_sync.c index 01b4a8b..6e090d2 100644 --- a/test/pass/16_cuda_hostalloc_implicit_sync.c +++ b/test/runtime/16_cuda_hostalloc_implicit_sync.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o 
%cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC // clang-format on diff --git a/test/pass/16_cuda_malloc_implicit_sync.c b/test/runtime/16_cuda_malloc_implicit_sync.c similarity index 87% rename from test/pass/16_cuda_malloc_implicit_sync.c rename to test/runtime/16_cuda_malloc_implicit_sync.c index 65360b7..508e834 100644 --- a/test/pass/16_cuda_malloc_implicit_sync.c +++ b/test/runtime/16_cuda_malloc_implicit_sync.c @@ -1,8 +1,8 @@ // clang-format off -// RUN: %wrapper-cxx %tsan-compile-flags -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe // RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -// RUN: %wrapper-cxx %tsan-compile-flags -DCUSAN_SYNC -O1 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe // RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC // clang-format on diff --git a/test/runtime/17_cuda_stream_query_busy_loop.c b/test/runtime/17_cuda_stream_query_busy_loop.c new file mode 100644 index 0000000..3f8da15 --- /dev/null +++ b/test/runtime/17_cuda_stream_query_busy_loop.c @@ -0,0 +1,61 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + cudaStream_t stream1; + cudaStreamCreate(&stream1); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); + +#ifdef CUSAN_SYNC + while (cudaStreamQuery(stream1) != cudaSuccess) { + } +#endif + + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + // break; + } + } + + cudaFree(managed_data); + cudaStreamDestroy(stream1); + return 0; +} diff --git a/test/runtime/18_cuda_event_query_busy_loop.c b/test/runtime/18_cuda_event_query_busy_loop.c new file mode 100644 index 0000000..53dc573 --- /dev/null +++ b/test/runtime/18_cuda_event_query_busy_loop.c @@ -0,0 +1,64 
@@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + cudaStream_t stream1; + cudaStreamCreate(&stream1); + cudaEvent_t event1; + cudaEventCreate(&event1); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); + cudaEventRecord(event1, stream1); + +#ifdef CUSAN_SYNC + while (cudaEventQuery(event1) != cudaSuccess) { + } +#endif + + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + // break; + } + } + + cudaFree(managed_data); + cudaStreamDestroy(stream1); + return 0; +} diff --git a/test/runtime/18_cuda_event_with_flags_query_busy_loop.c b/test/runtime/18_cuda_event_with_flags_query_busy_loop.c new file mode 100644 index 0000000..72fc0dc --- /dev/null +++ b/test/runtime/18_cuda_event_with_flags_query_busy_loop.c @@ -0,0 +1,64 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* managed_data; + cudaStream_t stream1; + cudaStreamCreate(&stream1); + cudaEvent_t event1; + cudaEventCreateWithFlags(&event1, cudaEventBlockingSync); + + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + write_kernel_delay<<>>(managed_data, size, 1316134912); + cudaEventRecord(event1, stream1); + +#ifdef CUSAN_SYNC + while (cudaEventQuery(event1) != cudaSuccess) { + } +#endif + 
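+  // Note: cudaEventQuery() returns cudaSuccess only once all work recorded into
+  // event1 on stream1 has completed (otherwise cudaErrorNotReady), so this
+  // busy-wait provides the synchronization the host-side check below relies on.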
+ for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + // break; + } + } + + cudaFree(managed_data); + cudaStreamDestroy(stream1); + return 0; +} diff --git a/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c new file mode 100644 index 0000000..0d50773 --- /dev/null +++ b/test/runtime/18_cuda_to_mpi_event_query_busy_loop.c @@ -0,0 +1,86 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* managed_data; + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaStream_t stream1; + cudaStreamCreate(&stream1); + cudaEvent_t event1; + cudaEventCreate(&event1); + + if (world_rank == 0) { + cudaMemset(managed_data, 0, size * sizeof(int)); + write_kernel_delay<<>>(managed_data, size, 1316134912); + cudaEventRecord(event1, stream1); +#ifdef CUSAN_SYNC + while (cudaEventQuery(event1) != cudaSuccess) { + } +#endif + MPI_Send(managed_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(managed_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + break; + } + } + } + + cudaFree(managed_data); + cudaStreamDestroy(stream1); + cudaEventDestroy(event1); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c new file mode 100644 index 0000000..9d47dd4 --- /dev/null +++ b/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync.c @@ -0,0 +1,73 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* data; + // int* data2; + int* d_data2; + int* h_data = (int*)malloc(sizeof(int)); + int* h_data2 = (int*)malloc(sizeof(int)); + + int* h_data3 = (int*)malloc(size * sizeof(int)); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + cudaMalloc(&data, size * sizeof(int)); + cudaMemset(data, 0, size * sizeof(int)); + + cudaDeviceSynchronize(); + + write_kernel_delay<<>>(data, size, 1316134912); +#ifdef CUSAN_SYNC + cudaMemcpy(h_data, h_data2, sizeof(int), cudaMemcpyHostToHost); +#endif + cudaMemcpyAsync(h_data3, data, size * sizeof(int), cudaMemcpyDefault, stream2); + cudaStreamSynchronize(stream2); + for (int i = 0; i < size; i++) { + if (h_data3[i] == 0) { + printf("[Error] sync %i\n", h_data3[i]); + break; + } + } + + cudaFree(data); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + + return 0; +} diff --git a/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c b/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c new file mode 100644 index 0000000..c4773b2 --- /dev/null +++ b/test/runtime/19_cuda_cudaMemcpyAsyncH2H_implicit_sync_nonblocking.c @@ -0,0 +1,74 @@ +// clang-format off +// RUN: %wrapper-cxx 
%clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main() { + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + int* data; + // int* data2; + int* d_data2; + int* h_data = (int*)malloc(sizeof(int)); + int* h_data2 = (int*)malloc(sizeof(int)); + + int* h_data3 = (int*)malloc(size * sizeof(int)); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + + cudaMalloc(&data, size * sizeof(int)); + cudaMemset(data, 0, size * sizeof(int)); + + cudaDeviceSynchronize(); + + write_kernel_delay<<>>(data, size, 1316134912); + cudaMemcpy(h_data, h_data2, sizeof(int), cudaMemcpyHostToHost); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(stream1); +#endif + cudaMemcpyAsync(h_data3, data, size * sizeof(int), cudaMemcpyDefault, stream2); + cudaStreamSynchronize(stream2); + for (int i = 0; i < size; i++) { + if (h_data3[i] == 0) { + printf("[Error] sync %i\n", h_data3[i]); + break; + } + } + + cudaFree(data); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + + return 0; +} diff --git a/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c new file mode 100644 index 0000000..dc00896 --- /dev/null +++ b/test/runtime/19_cuda_to_mpi_send_cudaMemcpyAsyncH2H_implicit_sync.c @@ -0,0 +1,91 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is 
designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + const int size = 256; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* data; + int* h_data = (int*)malloc(sizeof(int)); + int* h_data2 = (int*)malloc(sizeof(int)); + int* h_data3 = (int*)malloc(size * sizeof(int)); + cudaStream_t stream1; + cudaStreamCreate(&stream1); + + cudaMalloc(&data, size * sizeof(int)); + cudaMemset(data, 0, size * sizeof(int)); + + cudaDeviceSynchronize(); + + if (world_rank == 0) { + write_kernel_delay<<>>(data, size, 1316134912); +#ifdef CUSAN_SYNC + cudaMemcpy(h_data, h_data2, sizeof(int), cudaMemcpyHostToHost); +#endif + MPI_Send(data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + cudaMemcpyAsync(h_data3, data, size * sizeof(int), cudaMemcpyDefault, stream1); + cudaStreamSynchronize(stream1); + for (int i = 0; i < size; i++) { + if (h_data3[i] == 0) { + printf("[Error] sync %i\n", h_data3[i]); + break; + } + } + } + + free(h_data); + free(h_data2); + free(h_data3); + cudaFree(data); + cudaStreamDestroy(stream1); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/20_cuda_default_stream_sync.c b/test/runtime/20_cuda_default_stream_sync.c new file mode 100644 index 0000000..c75b44a --- /dev/null +++ b/test/runtime/20_cuda_default_stream_sync.c @@ -0,0 +1,67 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* managed_data; + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + int* d_data2; + cudaMalloc(&d_data2, size * sizeof(int)); + cudaDeviceSynchronize(); + + write_kernel_delay<<>>(managed_data, size, 99999); + write_kernel_delay<<>>(d_data2, size, 1); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(0); +#endif + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + break; + } + } + + cudaFree(managed_data); + 
cudaFree(d_data2); + return 0; +} diff --git a/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c new file mode 100644 index 0000000..8812b79 --- /dev/null +++ b/test/runtime/20_cuda_to_mpi_send_ds_sync_w_r.c @@ -0,0 +1,89 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s + +// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +// FLAKYPASS: * +// ALLOW_RETRIES: 5 + +#include "../support/gpu_mpi.h" + +#include + +__global__ void write_kernel_delay(int* arr, const int N, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. Exiting.\n"); + return 1; + } + + cudaStream_t stream1; + cudaStreamCreate(&stream1); + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* managed_data; + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + int* d_data2; + cudaMalloc(&d_data2, size * sizeof(int)); + cudaDeviceSynchronize(); + + if (world_rank == 0) { + write_kernel_delay<<>>(managed_data, size, 9999999); + write_kernel_delay<<>>(d_data2, size, 1); +#ifdef CUSAN_SYNC + cudaStreamSynchronize(0); +#endif + MPI_Send(managed_data, size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(managed_data, size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i\n", managed_data[i]); + break; + } + } + } + + cudaFree(managed_data); + cudaFree(d_data2); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/21_chunked_streams_example.c b/test/runtime/21_chunked_streams_example.c new file mode 100644 index 0000000..24c53ae --- /dev/null +++ b/test/runtime/21_chunked_streams_example.c @@ -0,0 +1,114 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s + +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +#include +#include +#include + +#define SENDER 0 +#define RECEIVER 1 +#define CHUNKS 4 +#define SIZE 1024 +// Example size, you can adjust as needed + +__global__ void computation_kernel(double* buf, int size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < size) { + buf[idx] = 1.0; // Example computation + } +} + +void computation_on_GPU(double* dev_buf, cudaStream_t kernel_stream) { + int threadsPerBlock = 256; + int blocksPerGrid = (SIZE + threadsPerBlock - 1) / threadsPerBlock; + computation_kernel<<>>(dev_buf, SIZE); +} + +void more_computation_on_GPU(double* dev_buf) { + // Placeholder for additional GPU computations + // Launch more kernels or perform other GPU tasks here +} + +int main(int argc, char* argv[]) { + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("[Error] This example is designed for 2 MPI processes. 
Exiting.\n"); + MPI_Finalize(); + return 1; + } + + double *dev_buf, *host_buf; + cudaStream_t kernel_stream, streams[CHUNKS]; + cudaMalloc(&dev_buf, SIZE * sizeof(double)); + host_buf = (double*)malloc(SIZE * sizeof(double)); + + // Create CUDA streams + cudaStreamCreate(&kernel_stream); + for (int j = 0; j < CHUNKS; j++) { + cudaStreamCreate(&streams[j]); + } + printf("Created Streams!\n"); + + if (world_rank == SENDER) { /* sender */ + computation_on_GPU(dev_buf, kernel_stream); + + // Explicit GPU sync between GPU streams + cudaStreamSynchronize(kernel_stream); + + // Calculate chunk size and offset + int chunk_size = SIZE / CHUNKS; + for (int j = 0; j < CHUNKS; j++) { + int offset = j * chunk_size; + cudaMemcpyAsync(host_buf + offset, dev_buf + offset, chunk_size * sizeof(double), cudaMemcpyDeviceToHost, + streams[j]); + } + + MPI_Request requests[CHUNKS]; + for (int j = 0; j < CHUNKS; j++) { + // Explicit GPU sync before MPI + cudaStreamSynchronize(streams[j]); + int offset = j * chunk_size; + MPI_Isend(host_buf + offset, chunk_size, MPI_DOUBLE, RECEIVER, 0, MPI_COMM_WORLD, &requests[j]); + } + MPI_Waitall(CHUNKS, requests, MPI_STATUSES_IGNORE); + + more_computation_on_GPU(dev_buf); + + } else if (world_rank == RECEIVER) { /* receiver */ + // Calculate chunk size and offset + int chunk_size = SIZE / CHUNKS; + MPI_Request requests[CHUNKS]; + for (int j = 0; j < CHUNKS; j++) { + int offset = j * chunk_size; + MPI_Irecv(host_buf + offset, chunk_size, MPI_DOUBLE, SENDER, 0, MPI_COMM_WORLD, &requests[j]); + } + + MPI_Waitall(CHUNKS, requests, MPI_STATUSES_IGNORE); + + // Use the received data (host_buf) on the GPU or CPU as needed + // Example: Copy received data to the GPU + cudaMemcpy(dev_buf, host_buf, SIZE * sizeof(double), cudaMemcpyHostToDevice); + more_computation_on_GPU(dev_buf); + } + + // Cleanup + cudaFree(dev_buf); + free(host_buf); + cudaStreamDestroy(kernel_stream); + for (int j = 0; j < CHUNKS; j++) { + cudaStreamDestroy(streams[j]); + } + + MPI_Finalize(); + return 0; +} \ No newline at end of file diff --git a/test/runtime/22_cuda_to_mpi_partial_buff_write.c b/test/runtime/22_cuda_to_mpi_partial_buff_write.c new file mode 100644 index 0000000..690d37a --- /dev/null +++ b/test/runtime/22_cuda_to_mpi_partial_buff_write.c @@ -0,0 +1,83 @@ +// clang-format off +// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s --allow-empty + +// clang-format on + +// CHECK-NOT: data race +// CHECK-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +__global__ void kernel(int* arr, const int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(1000000U); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + arr[tid] = (tid + 1); + } +} + +int main(int argc, char* argv[]) { + if (!has_gpu_aware_mpi()) { + printf("This example is designed for CUDA-aware MPI. 
Exiting.\n"); + return 1; + } + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + static_assert(size % 2 == 0, "Needs to be divisible by 2"); + const int half_size = size / 2; + + MPI_Init(&argc, &argv); + int world_size, world_rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); + + if (world_size != 2) { + printf("This example is designed for 2 MPI processes. Exiting.\n"); + MPI_Finalize(); + return 1; + } + + int* d_data; + cudaMalloc(&d_data, size * sizeof(int)); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + if (world_rank == 0) { + kernel<<>>(d_data, half_size); + cudaStreamSynchronize(stream); + kernel<<>>(&d_data[half_size], half_size); + MPI_Send(d_data, half_size, MPI_INT, 1, 0, MPI_COMM_WORLD); + cudaStreamSynchronize(stream); + MPI_Send(&d_data[half_size], half_size, MPI_INT, 1, 0, MPI_COMM_WORLD); + } else if (world_rank == 1) { + MPI_Recv(d_data, half_size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + MPI_Recv(&d_data[half_size], half_size, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + + int* h_data = (int*)malloc(size * sizeof(int)); + cudaMemcpyAsync(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + for (int i = 0; i < size; i++) { + const int buf_v = h_data[i]; + if (buf_v == 0) { + printf("[Error] sync %i\n", i); + break; + } + } + free(h_data); + } + + cudaStreamDestroy(stream); + cudaFree(d_data); + MPI_Finalize(); + return 0; +} diff --git a/test/runtime/23_cuda_default_stream_post_sync.c b/test/runtime/23_cuda_default_stream_post_sync.c new file mode 100644 index 0000000..008e050 --- /dev/null +++ b/test/runtime/23_cuda_default_stream_post_sync.c @@ -0,0 +1,67 @@ +// clang-format off +// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s + +// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe +// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC + +// clang-format on + +// CHECK-DAG: data race +// CHECK-DAG: [Error] sync + +// CHECK-SYNC-NOT: data race +// CHECK-SYNC-NOT: [Error] sync + +#include "../support/gpu_mpi.h" + +#include + +__global__ void write_kernel_delay(int* arr, const int N, int value, const unsigned int delay) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; +#if __CUDA_ARCH__ >= 700 + for (int i = 0; i < tid; i++) { + __nanosleep(delay); + } +#else + printf(">>> __CUDA_ARCH__ !\n"); +#endif + if (tid < N) { + arr[tid] = value; + } +} + +int main(int argc, char* argv[]) { + cudaStream_t stream; + cudaStreamCreate(&stream); + + const int size = 512; + const int threadsPerBlock = size; + const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock; + + int* managed_data; + cudaMallocManaged(&managed_data, size * sizeof(int)); + cudaMemset(managed_data, 0, size * sizeof(int)); + + int* d_data2; + cudaMalloc(&d_data2, size * sizeof(int)); + cudaDeviceSynchronize(); + + write_kernel_delay<<>>(managed_data, size, 128, 9999999); + write_kernel_delay<<>>(d_data2, size, 0, 1); + +#ifdef CUSAN_SYNC + cudaStreamSynchronize(stream); +#endif + for (int i = 0; i < size; i++) { + if (managed_data[i] == 0) { + printf("[Error] sync %i 
%i\n", managed_data[i], i);
+      break;
+    }
+  }
+
+  cudaStreamDestroy(stream);
+  cudaFree(managed_data);
+  cudaFree(d_data2);
+  return 0;
+}
diff --git a/test/runtime/23_cuda_default_stream_post_sync_nonblocking.c b/test/runtime/23_cuda_default_stream_post_sync_nonblocking.c
new file mode 100644
index 0000000..8b7fe9d
--- /dev/null
+++ b/test/runtime/23_cuda_default_stream_post_sync_nonblocking.c
@@ -0,0 +1,67 @@
+// clang-format off
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void write_kernel_delay(int* arr, const int N, int value, const unsigned int delay) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(delay);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  if (tid < N) {
+    arr[tid] = value;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  cudaStream_t stream;
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+  const int size = 512;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int* managed_data;
+  cudaMallocManaged(&managed_data, size * sizeof(int));
+  cudaMemset(managed_data, 0, size * sizeof(int));
+
+  int* d_data2;
+  cudaMalloc(&d_data2, size * sizeof(int));
+  cudaDeviceSynchronize();
+
+  write_kernel_delay<<>>(managed_data, size, 128, 9999999);
+  write_kernel_delay<<>>(d_data2, size, 0, 1);
+
+  cudaStreamSynchronize(stream);
+#ifdef CUSAN_SYNC
+  cudaDeviceSynchronize();
+#endif
+  for (int i = 0; i < size; i++) {
+    if (managed_data[i] == 0) {
+      printf("[Error] sync %i %i\n", managed_data[i], i);
+      break;
+    }
+  }
+
+  cudaStreamDestroy(stream);
+  cudaFree(managed_data);
+  cudaFree(d_data2);
+  return 0;
+}
diff --git a/test/runtime/24_cuda_sync_stream_default_nonblocking.c b/test/runtime/24_cuda_sync_stream_default_nonblocking.c
new file mode 100644
index 0000000..4af7a6f
--- /dev/null
+++ b/test/runtime/24_cuda_sync_stream_default_nonblocking.c
@@ -0,0 +1,63 @@
+// clang-format off
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void write_kernel_delay(int* arr, const int N, int value, const unsigned int delay) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(delay);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  if (tid < N) {
+    arr[tid] = value;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  cudaStream_t stream;
+#ifdef CUSAN_SYNC
+  cudaStreamCreate(&stream);
+#else
+  cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+#endif
+
+  const int size = 512;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int* managed_data;
+  cudaMallocManaged(&managed_data, size * sizeof(int));
+  cudaMemset(managed_data, 0, size * sizeof(int));
+
+  write_kernel_delay<<>>(managed_data, size, 128, 9999999);
+  cudaStreamSynchronize(0);
+
+  for (int i = 0; i < size; i++) {
+    if (managed_data[i] == 0) {
+      printf("[Error] sync %i %i\n", managed_data[i], i);
+      break;
+    }
+  }
+
+  cudaStreamDestroy(stream);
+  cudaFree(managed_data);
+  return 0;
+}
diff --git a/test/runtime/25_cuda_default_stream_double_sync.c b/test/runtime/25_cuda_default_stream_double_sync.c
new file mode 100644
index 0000000..2345193
--- /dev/null
+++ b/test/runtime/25_cuda_default_stream_double_sync.c
@@ -0,0 +1,68 @@
+// clang-format off
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void write_kernel_delay(int* arr, const int N, int value, const unsigned int delay) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(delay);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  if (tid < N) {
+    arr[tid] = value;
+  }
+}
+
+int main(int argc, char* argv[]) {
+  cudaStream_t stream1;
+  cudaStream_t stream2;
+  cudaStreamCreate(&stream1);
+  cudaStreamCreate(&stream2);
+
+  const int size = 512;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int* managed_data;
+  cudaMallocManaged(&managed_data, size * sizeof(int));
+  cudaMemset(managed_data, 0, size * sizeof(int));
+
+  int* d_data2;
+  cudaMalloc(&d_data2, size * sizeof(int));
+  cudaDeviceSynchronize();
+
+  write_kernel_delay<<>>(managed_data, size, 128, 9999999);
+  write_kernel_delay<<>>(d_data2, size, 0, 1);
+  write_kernel_delay<<>>(d_data2, size, 128, 1);
+#ifdef CUSAN_SYNC
+  cudaStreamSynchronize(stream2);
+#endif
+  for (int i = 0; i < size; i++) {
+    if (managed_data[i] == 0) {
+      printf("[Error] sync %i %i\n", managed_data[i], i);
+      break;
+    }
+  }
+
+  cudaFree(managed_data);
+  cudaFree(d_data2);
+  return 0;
+}
diff --git a/test/runtime/26_malloc_pitch.c b/test/runtime/26_malloc_pitch.c
new file mode 100644
index 0000000..9843279
--- /dev/null
+++ b/test/runtime/26_malloc_pitch.c
@@ -0,0 +1,93 @@
+// clang-format off
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void kernel(int* arr, const int N) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < N) {
+#if __CUDA_ARCH__ >= 700
+    for (int i = 0; i < tid; i++) {
+      __nanosleep(1000000U);
+    }
+#else
+    printf(">>> __CUDA_ARCH__ !\n");
+#endif
+    arr[tid] = (tid + 1);
+  }
+}
+
+int main(int argc, char* argv[]) {
+  if (!has_gpu_aware_mpi()) {
+    printf("This example is designed for CUDA-aware MPI. Exiting.\n");
+    return 1;
+  }
+
+  const int width = 4;
+  const int height = 8;
+
+  int* d_data;
+  size_t pitch;
+  cudaMallocPitch(&d_data, &pitch, width * sizeof(int), height);
+
+  size_t true_buffer_size = pitch * height;
+  size_t true_n_elements = true_buffer_size / sizeof(int);
+  // printf("%zu %zu %zu\n", true_buffer_size, true_n_elements, pitch);
+  assert(true_buffer_size % sizeof(int) == 0);
+
+  const int threadsPerBlock = true_n_elements;
+  const int blocksPerGrid = (true_n_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+  MPI_Init(&argc, &argv);
+  int world_size, world_rank;
+  MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
+
+  if (world_size != 2) {
+    printf("This example is designed for 2 MPI processes. Exiting.\n");
+    MPI_Finalize();
+    return 1;
+  }
+
+  if (world_rank == 0) {
+    kernel<<>>(d_data, true_n_elements);
+#ifdef CUSAN_SYNC
+    cudaDeviceSynchronize();  // required for correct execution
+#endif
+    MPI_Send(d_data, true_n_elements, MPI_INT, 1, 0, MPI_COMM_WORLD);
+  } else if (world_rank == 1) {
+    MPI_Recv(d_data, true_n_elements, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+  }
+
+  if (world_rank == 1) {
+    int* h_data = (int*)malloc(true_buffer_size);
+    cudaMemcpy(h_data, d_data, true_buffer_size, cudaMemcpyDeviceToHost);
+    for (int i = 0; i < true_n_elements; i++) {
+      const int buf_v = h_data[i];
+      // printf("buf[%d] = %d (r%d)\n", i, buf_v, world_rank);
+      if (buf_v == 0) {
+        printf("[Error] sync\n");
+        break;
+      }
+    }
+    free(h_data);
+  }
+
+  cudaFree(d_data);
+  MPI_Finalize();
+  return 0;
+}
diff --git a/test/runtime/27_cuda_memcpy2d_implicit_syn.c b/test/runtime/27_cuda_memcpy2d_implicit_syn.c
new file mode 100644
index 0000000..e9bbbac
--- /dev/null
+++ b/test/runtime/27_cuda_memcpy2d_implicit_syn.c
@@ -0,0 +1,87 @@
+// clang-format off
+
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void kernel(int* arr, const int N) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < N) {
+#if __CUDA_ARCH__ >= 700
+    for (int i = 0; i < tid; i++) {
+      __nanosleep(99000000U);
+    }
+#else
+    printf(">>> __CUDA_ARCH__ !\n");
+#endif
+    arr[tid] = (tid + 1);
+  }
+}
+
+int main(int argc, char* argv[]) {
+  const int width = 64;
+  const int height = 8;
+
+  int* d_data;
+  size_t pitch;
+  // allocations
+  cudaMallocPitch(&d_data, &pitch, width * sizeof(int), height);
+  int* h_data = (int*)malloc(width * sizeof(int) * height);
+  int* dummy_h_data = (int*)malloc(width * sizeof(int) * height);
+
+  size_t true_buffer_size = pitch * height;
+  size_t true_n_elements = true_buffer_size / sizeof(int);
+  assert(true_buffer_size % sizeof(int) == 0);
+  const int threadsPerBlock = true_n_elements;
+  const int blocksPerGrid = (true_n_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+  cudaStream_t stream1;
+  cudaStreamCreate(&stream1);
+  cudaStream_t stream2;
+  cudaStreamCreate(&stream2);
+
+  // null out all the data
+  cudaMemset2D(d_data, pitch, 0, width, height);
+  memset(h_data, 0, width * sizeof(int) * height);
+  cudaDeviceSynchronize();
+
+  kernel<<>>(d_data, true_n_elements);
+
+#ifdef CUSAN_SYNC
+  // copy into dummy data buffer causing implicit sync
+  cudaMemcpy2D(dummy_h_data, width * sizeof(int), d_data, pitch, width * sizeof(int), height, cudaMemcpyDeviceToHost);
+#endif
+
+  // async non-blocking copy that races with the writing kernel if no sync happened in between
+  cudaMemcpy2DAsync(h_data, width * sizeof(int), d_data, pitch, width * sizeof(int), height, cudaMemcpyDeviceToHost,
+                    stream2);
+  cudaStreamSynchronize(stream2);
+  for (int i = 0; i < width * height; i++) {
+    const int buf_v = h_data[i];
+    // printf("buf[%d] = %d\n", i, buf_v);
+    if (buf_v == 0) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  free(h_data);
+  free(dummy_h_data);
+  cudaFree(d_data);
+  cudaStreamDestroy(stream1);
+  cudaStreamDestroy(stream2);
+  return 0;
+}
diff --git a/test/runtime/28_cuda_memset2d_implicit_syn.c b/test/runtime/28_cuda_memset2d_implicit_syn.c
new file mode 100644
index 0000000..733e25b
--- /dev/null
+++ b/test/runtime/28_cuda_memset2d_implicit_syn.c
@@ -0,0 +1,91 @@
+// clang-format off
+
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+#include "../support/gpu_mpi.h"
+
+#include
+
+__global__ void kernel(int* arr, const int N) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < N) {
+#if __CUDA_ARCH__ >= 700
+    for (int i = 0; i < tid; i++) {
+      __nanosleep(99000000U);
+    }
+#else
+    printf(">>> __CUDA_ARCH__ !\n");
+#endif
+    arr[tid] = (tid + 1);
+  }
+}
+
+int main(int argc, char* argv[]) {
+  const int width = 64;
+  const int height = 8;
+
+  int* d_data;
+  size_t pitch;
+  // allocations
+  cudaMallocPitch(&d_data, &pitch, width * sizeof(int), height);
+
+  int* dummy_d_data;
+  size_t dummy_pitch;
+  cudaMallocPitch(&dummy_d_data, &dummy_pitch, width * sizeof(int), height);
+  int* h_data = (int*)malloc(width * sizeof(int) * height);
+
+  size_t true_buffer_size = pitch * height;
+  size_t true_n_elements = true_buffer_size / sizeof(int);
+  assert(true_buffer_size % sizeof(int) == 0);
+  const int threadsPerBlock = true_n_elements;
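+  // note: true_n_elements counts every int in the pitched allocation (pitch bytes per row, padding included)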
+  const int blocksPerGrid = (true_n_elements + threadsPerBlock - 1) / threadsPerBlock;
+
+  cudaStream_t stream1;
+  cudaStreamCreate(&stream1);
+  cudaStream_t stream2;
+  cudaStreamCreateWithPriority(&stream2, cudaStreamDefault, -1);
+
+  // null out all the data
+  cudaMemset2D(d_data, pitch, 0, width, height);
+  memset(h_data, 0, width * sizeof(int) * height);
+  cudaDeviceSynchronize();
+
+  kernel<<>>(d_data, true_n_elements);
+
+#ifdef CUSAN_SYNC
+  // memset on the dummy buffer causing implicit sync
+  cudaMemset2D(dummy_d_data, dummy_pitch, 0, width, height);
+#endif
+
+  // async non-blocking copy that races with the writing kernel if no sync happened in between
+  cudaMemcpy2DAsync(h_data, width * sizeof(int), d_data, pitch, width * sizeof(int), height, cudaMemcpyDeviceToHost,
+                    stream2);
+  cudaStreamSynchronize(stream2);
+  for (int i = 0; i < width * height; i++) {
+    const int buf_v = h_data[i];
+    // printf("buf[%d] = %d\n", i, buf_v);
+    if (buf_v == 0) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  free(h_data);
+  cudaFree(d_data);
+  cudaFree(dummy_d_data);
+  cudaStreamDestroy(stream1);
+  cudaStreamDestroy(stream2);
+  return 0;
+}
diff --git a/test/tsan/01_tsan_cuda_to_mpi.c b/test/runtime/29_tsan_cuda_to_mpi.c
similarity index 86%
rename from test/tsan/01_tsan_cuda_to_mpi.c
rename to test/runtime/29_tsan_cuda_to_mpi.c
index e92a524..8365f31 100644
--- a/test/tsan/01_tsan_cuda_to_mpi.c
+++ b/test/runtime/29_tsan_cuda_to_mpi.c
@@ -1,11 +1,10 @@
 // clang-format off
-// TODO: Fix segfault when program terminates.
-
-// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
 
-// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
 // clang-format on
 
 // CHECK: [Error] sync
diff --git a/test/tsan/02_tsan_annotate_cuda_to_mpi.c b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
similarity index 87%
rename from test/tsan/02_tsan_annotate_cuda_to_mpi.c
rename to test/runtime/30_tsan_annotate_cuda_to_mpi.c
index 22a29d5..10e5083 100644
--- a/test/tsan/02_tsan_annotate_cuda_to_mpi.c
+++ b/test/runtime/30_tsan_annotate_cuda_to_mpi.c
@@ -1,10 +1,8 @@
 // clang-format off
-// TODO: Fix segfault when program terminates.
-
-// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %cusan_ldpreload %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 // clang-format on
diff --git a/test/tsan/03_tsan_cuda_event.c b/test/runtime/31_tsan_cuda_event.c
similarity index 63%
rename from test/tsan/03_tsan_cuda_event.c
rename to test/runtime/31_tsan_cuda_event.c
index 9a2bd59..f8b1a86 100644
--- a/test/tsan/03_tsan_cuda_event.c
+++ b/test/runtime/31_tsan_cuda_event.c
@@ -1,27 +1,16 @@
 // clang-format off
-// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 1 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %mpi-exec -n 1 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
-// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
-
 // clang-format on
 
 // CHECK-DAG: data race
 
 // CHECK-SYNC-NOT: data race
 
-// CHECK-LLVM-IR: invoke i32 @cudaEventCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaEventRecord
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record
-
 #include "../support/gpu_mpi.h"
 
 #include
diff --git a/test/tsan/04_tsan_async_copy.c b/test/runtime/32_tsan_async_copy.c
similarity index 64%
rename from test/tsan/04_tsan_async_copy.c
rename to test/runtime/32_tsan_async_copy.c
index 0b57986..e322ddd 100644
--- a/test/tsan/04_tsan_async_copy.c
+++ b/test/runtime/32_tsan_async_copy.c
@@ -1,28 +1,16 @@
 // clang-format off
-// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %mpi-exec -n 2 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
-// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
-
 // CHECK-DAG: data race
 // CHECK-DAG: [Error] sync
 
 // CHECK-SYNC-NOT: data race
 // CHECK-SYNC-NOT: [Error] sync
 
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaMemcpyAsync(i8* {{.*}}[[mcpyasy_target:%[0-9a-z]+]], i8* {{.*}}[[mcpyasy_from:%[0-9a-z]+]],
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy_async(i8* {{.*}}[[mcpyasy_target]], i8* {{.*}}[[mcpyasy_from]],
-// CHECK-LLVM-IR: invoke i32 @cudaStreamSynchronize
-// CHECK-LLVM-IR: invoke i32 @cudaStreamDestroy
-// CHECK-LLVM-IR: invoke i32 @cudaStreamDestroy
-
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/tsan/05_tsan_wait_event.c b/test/runtime/33_tsan_wait_event.c
similarity index 63%
rename from test/tsan/05_tsan_wait_event.c
rename to test/runtime/33_tsan_wait_event.c
index 46c8c9c..1c8f27b 100644
--- a/test/tsan/05_tsan_wait_event.c
+++ b/test/runtime/33_tsan_wait_event.c
@@ -1,32 +1,14 @@
 // clang-format off
-// RUN: %wrapper-mpicxx %tsan-compile-flags -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %wrapper-mpicxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
 // RUN: %tsan-options %mpi-exec -n 1 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s -DFILENAME=%s
 
-// RUN: %wrapper-mpicxx %tsan-compile-flags -DCUSAN_SYNC -O2 -g %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %wrapper-mpicxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
 // RUN: %tsan-options %mpi-exec -n 1 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-SYNC
 
-// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 > test_out.ll
-// RUN: %apply %s --cusan-kernel-data=%t.yaml --show_host_ir -x cuda --cuda-gpu-arch=sm_72 2>&1 | %filecheck %s -DFILENAME=%s --allow-empty --check-prefix CHECK-LLVM-IR
-
 // CHECK-DAG: data race
 
 // CHECK-SYNC-NOT: data race
 
-// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[mcpy_target:%[0-9a-z]+]], i8* {{.*}}[[mcpy_from:%[0-9a-z]+]],
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[mcpy_target]], i8* {{.*}}[[mcpy_from]],
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaStreamCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_stream
-// CHECK-LLVM-IR: invoke i32 @cudaEventCreate
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_create_event
-// CHECK-LLVM-IR: invoke i32 @cudaEventRecord
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_event_record
-// CHECK-LLVM-IR: invoke i32 @cudaMemcpy(i8* {{.*}}[[mcpy2_target:%[0-9a-z]+]], i8* {{.*}}[[mcpy2_from:%[0-9a-z]+]],
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_memcpy(i8* {{.*}}[[mcpy2_target]], i8* {{.*}}[[mcpy2_from]],
-// CHECK-LLVM-IR: invoke i32 @cudaStreamSynchronize
-// CHECK-LLVM-IR: {{call|invoke}} void @_cusan_sync_stream
-
 // clang-format on
 
 #include "../support/gpu_mpi.h"
diff --git a/test/runtime/34_negative_array.c b/test/runtime/34_negative_array.c
new file mode 100644
index 0000000..9912bde
--- /dev/null
+++ b/test/runtime/34_negative_array.c
@@ -0,0 +1,63 @@
+// clang-format off
+// RUN: %wrapper-cc %clang_args0 %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cc -DCUSAN_SYNC %clang_args0 %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %cusan_ldpreload %tsan-options %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+
+// clang-format on
+
+// CHECK-DAG: data race
+// CHECK-DAG: [Error] sync
+
+// CHECK-SYNC-NOT: data race
+// CHECK-SYNC-NOT: [Error] sync
+
+// XFAIL:*
+
+#include
+#include
+
+__global__ void kernel(int** data) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+#if __CUDA_ARCH__ >= 700
+  for (int i = 0; i < tid; i++) {
+    __nanosleep(1000000U);
+  }
+#else
+  printf(">>> __CUDA_ARCH__ !\n");
+#endif
+  data[-1][tid] = (tid + 1);
+}
+
+int main() {
+  const int size = 256;
+  const int threadsPerBlock = 256;
+  const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  int** d_data;  // Unified Memory pointer
+  cudaMallocManaged(&d_data, 2 * sizeof(int*));
+
+  // Allocate Unified Memory
+  cudaMallocManaged(&d_data[0], size * sizeof(int));
+  cudaMallocManaged(&d_data[1], size * sizeof(int));
+  cudaMemset(d_data[0], 0, size * sizeof(int));
+  cudaMemset(d_data[1], 0, size * sizeof(int));
+
+  kernel<<>>(&d_data[1]);
+
+#ifdef CUSAN_SYNC
+  cudaDeviceSynchronize();
+#endif
+
+  for (int i = 0; i < size; i++) {
+    if (d_data[0][i] < 1) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  cudaFree(d_data);
+
+  return 0;
+}
diff --git a/test/runtime/35_struct_null.c b/test/runtime/35_struct_null.c
new file mode 100644
index 0000000..083cb13
--- /dev/null
+++ b/test/runtime/35_struct_null.c
@@ -0,0 +1,61 @@
+// clang-format off
+// RUN: %wrapper-cxx %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t.exe
+// RUN: %tsan-options timeout 1 %cusan_test_dir/%basename_t.exe 2>&1 | %filecheck %s
+
+// RUN: %wrapper-cxx -DCUSAN_SYNC %clang_args %s -x cuda -gencode arch=compute_70,code=sm_70 -o %cusan_test_dir/%basename_t-sync.exe
+// RUN: %tsan-options timeout 1 %cusan_test_dir/%basename_t-sync.exe 2>&1 | %filecheck %s --allow-empty --check-prefix CHECK-SYNC
+// clang-format on
+
+// CHECK-DAG: data race
+
+// CHECK-SYNC-NOT: data race
+
+// XFAIL: *
+
+#include "../support/gpu_mpi.h"
+
+struct BufferStorage {
+  int* buff1;
+  // a list of pointers
+  int** buff2;
+};
+
+__global__ void kernel(BufferStorage storage, const int N, bool write_second) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  if (tid < N) {
+    storage.buff1[tid] = tid * 32;
+    if (write_second) {
+      storage.buff2[0][tid] = tid * 32;
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  const int size = 512;
+  const int threadsPerBlock = size;
+  const int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
+
+  BufferStorage buffStor;
+  cudaMallocManaged(&buffStor.buff1, size * sizeof(int));
+  /// cudaMallocManaged(&buffStor.buff2, sizeof(int*));
+
+  buffStor.buff2 = 0;
+
+  // Since the boolean argument is false, buff2 is never dereferenced and may legally be a null pointer,
+  // but the pass reasons only about the static code and cannot know this runtime information.
+  kernel<<>>(buffStor, size, false);
+#ifdef CUSAN_SYNC
+  cudaDeviceSynchronize();
+#endif
+
+  for (int i = 0; i < size; i++) {
+    if (buffStor.buff1[i] < 1) {
+      printf("[Error] sync\n");
+      break;
+    }
+  }
+
+  cudaFree(buffStor.buff1);
+  cudaFree(buffStor.buff2);
+  return 0;
+}
diff --git a/test/tsan/TSan_External.h b/test/runtime/TSan_External.h
similarity index 100%
rename from test/tsan/TSan_External.h
rename to test/runtime/TSan_External.h
diff --git a/test/tsan/suppressions.txt b/test/runtime/suppressions.txt
similarity index 95%
rename from test/tsan/suppressions.txt
rename to test/runtime/suppressions.txt
index 5a01ec8..3292e71 100644
--- a/test/tsan/suppressions.txt
+++ b/test/runtime/suppressions.txt
@@ -1,6 +1,7 @@
 called_from_lib:libmpi_cxx*
 called_from_lib:libmpi.so*
 called_from_lib:libcuda.so*
+called_from_lib:libtypeartRuntime.so*
 
 # Probably not required, from previous experiments
 #called_from_lib:libucx*
@@ -26,4 +27,3 @@ called_from_lib:libopen-pal.so.40*
 called_from_lib:libopen-rte.so.40*
 called_from_lib:ld-linux-x86-64.so*
 called_from_lib:libmlx5-rdma*
-called_from_lib:libnvidia-ml.so.1