diff --git a/notebooks/ProjectMockup.ipynb b/notebooks/ProjectMockup.ipynb new file mode 100644 index 00000000..e1070353 --- /dev/null +++ b/notebooks/ProjectMockup.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1c4fd5b5-1b9e-4a8e-82b3-6ab214c203ea", + "metadata": {}, + "source": [ + "## Project Mockup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "54a52e27-3d9d-4574-990b-329a47d4bf13", + "metadata": {}, + "outputs": [], + "source": [ + "struct S { double val = 1.0; };" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b9e80eda-8924-4f3c-bbc0-7b74ba9b9a36", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "python_vec = cppyy.gbl.std.vector(cppyy.gbl.S)(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "697760dd-200e-4fa6-85b2-f1786215eda3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1.0\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "print(python_vec[3].val)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d581f6cc-2737-4c11-b37a-25bb5296936a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<__main__.Derived object at 0x7fffdf639e40>\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + ":1: RuntimeWarning: class \"S\" has no virtual destructor\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "class Derived(cppyy.gbl.S):\n", + " def __init__(self):\n", + " val = 0\n", + "res = Derived()\n", + "print(res)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f20feaa6-62a2-4805-853b-268b1c1832ac", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void arr_sum(int n, double *x, double *sum) {\n", + " for(int i = 0; i < n; i++)\n", + " *sum +=x[i];\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cfd92edb-41e0-4bb8-b7a1-852782964423", + "metadata": {}, + "outputs": [], + "source": [ + "int n = 5;\n", + "double h_sum;\n", + "double *x = new double[n];" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "90480051-7a57-4145-85c4-ca3bdc8e101f", + "metadata": {}, + "outputs": [], + "source": [ + "void setData(const std::vector& a) {\n", + " int i = 0;\n", + " for(auto &s : a) {\n", + " x[i] = s.val;\n", + " i++;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9d9913b7-db39-4c76-8939-efbca9256143", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "data_list = [1.0, 2.0, 3.0, 4.0, 5.0]\n", + "for c, i in enumerate(data_list):\n", + " python_vec[c].val = i\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "846901c7-0cbb-4978-9efe-7f5870b939a7", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "cppyy.gbl.setData(python_vec)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f57c46fa-addb-4d80-8cc3-29a464911fdc", + "metadata": {}, + "outputs": [], + "source": [ + "double *d_x, *d_sum;\n", + "cudaMalloc((void **)&d_x, n * sizeof(double));\n", + "cudaMalloc((void **)&d_sum, sizeof(double));" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d617ee2c-d9b7-4c4f-8ef2-051df37b2737", + "metadata": {}, + "outputs": [], + "source": [ + "cudaMemcpy(d_x, x, n * sizeof(double), cudaMemcpyHostToDevice);\n", + "cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8417b211-e7f9-448d-969a-27fc0eb32697", + "metadata": {}, + "outputs": [], + "source": [ + "arr_sum<<<1, 1>>>(n, d_x, d_sum);" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4374b687-31a5-4b85-8d66-e738956814b4", + "metadata": {}, + "outputs": [], + "source": [ + "cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "8198bfdf-0ff7-4e27-ba6b-087833055794", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sum: 15\n" + ] + } + ], + "source": [ + "std::cout << \"Sum: \" << h_sum << std::endl;" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "33ffcf05-58cc-4212-aef7-4a5813fdc178", + "metadata": {}, + "outputs": [], + "source": [ + "delete[] x;\n", + "cudaFree(d_x);\n", + "cudaFree(d_sum);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "add835ef-6196-47d8-be75-14b872438de1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CUDA (C++17)", + "language": "CUDA", + "name": "cuda-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/Python-Cpp-Integration-Demo.ipynb b/notebooks/Python-Cpp-Integration-Demo.ipynb new file mode 100644 index 00000000..41ae6eab --- /dev/null +++ b/notebooks/Python-Cpp-Integration-Demo.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ca6bccaa", + "metadata": {}, + "source": [ + "## Declaring variables in C++ " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "879a6503", + "metadata": {}, + "outputs": [], + "source": [ + "extern \"C\" int printf(const char*,...);" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2063086d", + "metadata": {}, + "outputs": [], + "source": [ + "int new_var1 = 12;\n", + "int new_var2 = 25;\n", + "int new_var3 = 64;" + ] + }, + { + "cell_type": "markdown", + "id": "6e42ad3d", + "metadata": {}, + "source": [ + "## Running Python with C++ variables" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "7edde9fb", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This is printed from Python: Today is Tue Oct 25 11:38:08 2022\n", + "[1, 2, 12, 25, 64]\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "from time import time,ctime\n", + "print('This is printed from Python: Today is', ctime(time()))\n", + "python_array = [1, 2, new_var1, new_var2, new_var3]\n", + "print(python_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "579f1d82", + "metadata": {}, + "outputs": [], + "source": [ + "%%python \n", + "\n", + "new_python_var = 1327\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "43391a7e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "new_python_var = 1327\n" + ] + } + ], + "source": [ + "auto k = printf(\"new_python_var = %d\\n\", new_python_var);" + ] + }, + { + "cell_type": "markdown", + "id": "9030df20", + "metadata": {}, + "source": [ + "## Working with a vector type" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "400e42d3", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "92b2bd05", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector fill_vector(std::vector g) { for (int i = 1; i <= 5; i++){ g.push_back(i);} return g; }" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ea7f2d92", + "metadata": {}, + "outputs": [], + "source": [ + "int size_of_vector (std::vector g) { std::cout << \"Size : \" << g.size(); return g.size(); }" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "760237a2", + "metadata": {}, + "outputs": [], + "source": [ + "int capacity_of_vector (std::vector g) { std::cout << \"Capacity : \" << g.capacity(); return g.capacity(); }" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1c02ee6c", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector cpp_vector = fill_vector(cpp_vector);" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a7fb4cb5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Size : 5" + ] + } + ], + "source": [ + "int size = size_of_vector(cpp_vector);" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "95f8e6f7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Capacity : 5" + ] + } + ], + "source": [ + "int capacity = capacity_of_vector(cpp_vector);" + ] + }, + { + "cell_type": "markdown", + "id": "781feea0", + "metadata": {}, + "source": [ + "## Plotting data from C++ and Python" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7ccb6fe7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[1, 2, 3, 4, 5]\n", + "[1, 2, 12, 25, 64]\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "print(cpp_vector)\n", + "print(python_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "531e00ff", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "plt.plot(python_array, cpp_vector,linewidth=3)\n", + "plt.xlabel('Python Array')\n", + "plt.ylabel('Cpp Array')\n", + "plt.savefig('python_cpp_plot.png')" + ] + }, + { + "cell_type": "markdown", + "id": "b1842530", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bf12bfb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++11", + "language": "C++11", + "name": "xcpp11" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/audio/audio.wav b/notebooks/audio/audio.wav new file mode 100644 index 00000000..ad425399 Binary files /dev/null and b/notebooks/audio/audio.wav differ diff --git a/notebooks/clad-demo.ipynb b/notebooks/clad-demo.ipynb new file mode 100644 index 00000000..34684c2c --- /dev/null +++ b/notebooks/clad-demo.ipynb @@ -0,0 +1,113 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "74f52cc2-bfda-4fee-9cd9-94d683a8b7e1", + "metadata": {}, + "outputs": [], + "source": [ + "#include \"clad/Differentiator/Differentiator.h\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "55453d37-e8af-4cb7-8f82-550e9cf88b3c", + "metadata": {}, + "outputs": [], + "source": [ + "// Rosenbrock function declaration\n", + "double rosenbrock_func(double x, double y) {\n", + "return (x - 1) * (x - 1) + 100 * (y - x * x) * (y - x * x);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9098b6e4-87a7-497d-9a1b-dddeb813dc56", + "metadata": {}, + "outputs": [], + "source": [ + "double rosenbrock_forward(double x[], int size) {\n", + " double sum = 0;\n", + " auto rosenbrockX = clad::differentiate(rosenbrock_func, 0);\n", + " auto rosenbrockY = clad::differentiate(rosenbrock_func, 1);\n", + " for (int i = 0; i < size-1; i++) {\n", + " double one = rosenbrockX.execute(x[i], x[i + 1]);\n", + " double two = rosenbrockY.execute(x[i], x[i + 1]);\n", + " sum = sum + one + two;\n", + " }\n", + " return sum;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e771bbdc-3979-43df-9ed9-3429ee713b5d", + "metadata": {}, + "outputs": [], + "source": [ + "const int size = 100000000;\n", + "double Xarray[size];\n", + "for(int i=0;i\n", + "// Based on smallpt, a Path Tracer by Kevin Beason, 2008\n", + "//----------------------------------------------------------------------------//\n", + "\n", + "// To compile the demo please type:\n", + "// path/to/clang -O3 -Xclang -add-plugin -Xclang clad -Xclang -load -Xclang \\\n", + "// path/to/libclad.so -I../../include/ -x c++ -std=c++11 -lstdc++ -lm SmallPT.cpp \\\n", + "// -fopenmp=libiomp5 -o SmallPT\n", + "//\n", + "// To run the demo please type:\n", + "// ./SmallPT 500 && xv image.ppm\n", + "\n", + "// A typical invocation would be:\n", + "// ../../../../../obj/Debug+Asserts/bin/clang -O3 -Xclang -add-plugin -Xclang clad \\\n", + "// -Xclang -load -Xclang ../../../../../obj/Debug+Asserts/lib/libclad.dylib \\\n", + "// -I../../include/ -x c++ -std=c++11 -lstdc++ -lm SmallPT.cpp -fopenmp=libiomp5 -o SmallPT\n", + "// ./SmallPT 500 && xv image.ppm\n", + "\n", + "// Necessary for clad to work include\n", + "#include \"clad/Differentiator/Differentiator.h\"\n", + "\n", + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "\n", + "#include \n", + "#include \n", + "\n", + "Cpp::LoadLibrary(\"libomp\");\n", + "\n", + "//TODO: Try to fix single precision issues\n", + "//#define double float\n", + "\n", + "// Test types (default is TEST_TYPE_BY_CLAD)\n", + "#define TEST_TYPE_BY_HAND\n", + "//#define TEST_TYPE_BY_CLAD\n", + "//#define TEST_TYPE_BY_NUM\n", + "\n", + "\n", + "struct Vec {\n", + " double x, y, z; // position, also color (r,g,b)\n", + "\n", + " Vec(double x_=0, double y_=0, double z_=0) { x=x_; y=y_; z=z_; }\n", + "\n", + " Vec operator+(const Vec &b) const { return Vec(x+b.x, y+b.y, z+b.z); }\n", + " Vec operator-(const Vec &b) const { return Vec(x-b.x, y-b.y, z-b.z); }\n", + " Vec operator*(double b) const { return Vec(x*b, y*b, z*b); }\n", + " double operator*(const Vec &b) const { return x*b.x+y*b.y+z*b.z; } // dot\n", + " Vec operator%(const Vec &b) const { return Vec(y*b.z-z*b.y, z*b.x-x*b.z, x*b.y-y*b.x); } // cross\n", + " Vec mult(const Vec &b) const { return Vec(x*b.x, y*b.y, z*b.z); }\n", + " Vec& norm() { return *this = *this * (1/sqrt(x*x+y*y+z*z)); }\n", + "};\n", + "\n", + "struct Ray {\n", + " Vec o, d; // Origin and direction\n", + " Ray() : o(), d() {}\n", + " Ray(Vec o_, Vec d_): o(o_), d(d_) {}\n", + "};\n", + "\n", + "enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance()\n", + "\n", + "#define inf 1e6\n", + "#define eps 1e-6" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "56215b0d-23f8-4866-b745-ee6b3b881eb2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "/*\n", + "Copyright (C) 2017 Milo Yip. All rights reserved.\n", + "\n", + "Redistribution and use in source and binary forms, with or without\n", + "modification, are permitted provided that the following conditions are met:\n", + "\n", + "* Redistributions of source code must retain the above copyright notice, this\n", + " list of conditions and the following disclaimer.\n", + "\n", + "* Redistributions in binary form must reproduce the above copyright notice,\n", + " this list of conditions and the following disclaimer in the documentation\n", + " and/or other materials provided with the distribution.\n", + "\n", + "* Neither the name of pngout nor the names of its\n", + " contributors may be used to endorse or promote products derived from\n", + " this software without specific prior written permission.\n", + "\n", + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n", + "AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n", + "IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n", + "DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n", + "FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n", + "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n", + "SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n", + "CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n", + "OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n", + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n", + "*/\n", + "\n", + "/*! \\file\n", + " \\brief svpng() is a minimalistic C function for saving RGB/RGBA image into uncompressed PNG.\n", + " \\author Milo Yip\n", + " \\version 0.1.1\n", + " \\copyright MIT license\n", + " \\sa http://github.com/miloyip/svpng\n", + "*/\n", + "\n", + "#define SVPNG_LINKAGE\n", + "\n", + "#include \n", + "#define SVPNG_OUTPUT FILE* fp\n", + "\n", + "#define SVPNG_PUT(u) fputc(u, fp)\n", + "\n", + "/*!\n", + " \\brief Save a RGB/RGBA image in PNG format.\n", + " \\param SVPNG_OUTPUT Output stream (by default using file descriptor).\n", + " \\param w Width of the image. (<16383)\n", + " \\param h Height of the image.\n", + " \\param img Image pixel data in 24-bit RGB or 32-bit RGBA format.\n", + " \\param alpha Whether the image contains alpha channel.\n", + "*/\n", + "SVPNG_LINKAGE void svpng(SVPNG_OUTPUT, unsigned w, unsigned h, const unsigned char* img, int alpha) {\n", + " static const unsigned t[] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, \n", + " /* CRC32 Table */ 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };\n", + " unsigned a = 1, b = 0, c, p = w * (alpha ? 4 : 3) + 1, x, y, i; /* ADLER-a, ADLER-b, CRC, pitch */\n", + "#define SVPNG_U8A(ua, l) for (i = 0; i < l; i++) SVPNG_PUT((ua)[i]);\n", + "#define SVPNG_U32(u) do { SVPNG_PUT((u) >> 24); SVPNG_PUT(((u) >> 16) & 255); SVPNG_PUT(((u) >> 8) & 255); SVPNG_PUT((u) & 255); } while(0)\n", + "#define SVPNG_U8C(u) do { SVPNG_PUT(u); c ^= (u); c = (c >> 4) ^ t[c & 15]; c = (c >> 4) ^ t[c & 15]; } while(0)\n", + "#define SVPNG_U8AC(ua, l) for (i = 0; i < l; i++) SVPNG_U8C((ua)[i])\n", + "#define SVPNG_U16LC(u) do { SVPNG_U8C((u) & 255); SVPNG_U8C(((u) >> 8) & 255); } while(0)\n", + "#define SVPNG_U32C(u) do { SVPNG_U8C((u) >> 24); SVPNG_U8C(((u) >> 16) & 255); SVPNG_U8C(((u) >> 8) & 255); SVPNG_U8C((u) & 255); } while(0)\n", + "#define SVPNG_U8ADLER(u) do { SVPNG_U8C(u); a = (a + (u)) % 65521; b = (b + a) % 65521; } while(0)\n", + "#define SVPNG_BEGIN(s, l) do { SVPNG_U32(l); c = ~0U; SVPNG_U8AC(s, 4); } while(0)\n", + "#define SVPNG_END() SVPNG_U32(~c)\n", + " SVPNG_U8A(\"\\x89PNG\\r\\n\\32\\n\", 8); /* Magic */\n", + " SVPNG_BEGIN(\"IHDR\", 13); /* IHDR chunk { */\n", + " SVPNG_U32C(w); SVPNG_U32C(h); /* Width & Height (8 bytes) */\n", + " SVPNG_U8C(8); SVPNG_U8C(alpha ? 6 : 2); /* Depth=8, Color=True color with/without alpha (2 bytes) */\n", + " SVPNG_U8AC(\"\\0\\0\\0\", 3); /* Compression=Deflate, Filter=No, Interlace=No (3 bytes) */\n", + " SVPNG_END(); /* } */\n", + " SVPNG_BEGIN(\"IDAT\", 2 + h * (5 + p) + 4); /* IDAT chunk { */\n", + " SVPNG_U8AC(\"\\x78\\1\", 2); /* Deflate block begin (2 bytes) */\n", + " for (y = 0; y < h; y++) { /* Each horizontal line makes a block for simplicity */\n", + " SVPNG_U8C(y == h - 1); /* 1 for the last block, 0 for others (1 byte) */\n", + " SVPNG_U16LC(p); SVPNG_U16LC(~p); /* Size of block in little endian and its 1's complement (4 bytes) */\n", + " SVPNG_U8ADLER(0); /* No filter prefix (1 byte) */\n", + " for (x = 0; x < p - 1; x++, img++)\n", + " SVPNG_U8ADLER(*img); /* Image pixel data */\n", + " }\n", + " SVPNG_U32C((b << 16) | a); /* Deflate block end with adler (4 bytes) */\n", + " SVPNG_END(); /* } */\n", + " SVPNG_BEGIN(\"IEND\", 0); SVPNG_END(); /* IEND chunk {} */\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3e489bc-5bec-4b62-a0b2-d7f7916e1ecf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#include \"xeus-clang-repl/xinterpreter.hpp\"\n", + "#include \"xtl/xbase64.hpp\"\n", + "#include \"nlohmann/json.hpp\"\n", + "#include \n", + "#include \n", + "#include \n", + "\n", + "inline double Clamp(double v, double low = 0.0, double high = 1.0) noexcept\n", + "{\n", + " return fmin(fmax(v, low), high);\n", + "}\n", + "\n", + "int toInt(float x)\n", + "{\n", + " return (int) (pow(Clamp(x, 0.0f, 1.0f), 1.0f / 2.2f) * 255 + 0.5f);\n", + "}\n", + "\n", + "void save(const char* fileName, int width, int height, Vec* data)\n", + "{\n", + " FILE *fp = fopen(fileName, \"wb\");\n", + "\n", + " // Convert from Vector3 array to uchar array\n", + " unsigned char* output = new unsigned char[width * height * 3];\n", + "\n", + " for(int i = 0; i < width * height; i++)\n", + " {\n", + " //printf_s(\"%f %f %f \\n\", data[i].x, data[i].y, data[i].z);\n", + " output[i * 3 + 0] = toInt(data[i].x);\n", + " output[i * 3 + 1] = toInt(data[i].y);\n", + " output[i * 3 + 2] = toInt(data[i].z);\n", + " }\n", + "\n", + " svpng(fp, width, height, output, 0);\n", + "\n", + " fclose(fp);\n", + " delete[] output;\n", + "}\n", + "\n", + "namespace nl = nlohmann;\n", + "\n", + "struct image { \n", + " inline image(const std::string& filename)\n", + " {\n", + " std::ifstream fin(filename, std::ios::binary); \n", + " m_buffer << fin.rdbuf();\n", + " }\n", + " \n", + " std::stringstream m_buffer;\n", + "};\n", + " \n", + "nl::json mime_bundle_repr(const image& i)\n", + "{\n", + " auto bundle = nl::json::object();\n", + " bundle[\"image/png\"] = xtl::base64encode(i.m_buffer.str());\n", + " return bundle;\n", + "}\n", + "\n", + "template void display(const T &t) {\n", + " using ::mime_bundle_repr;\n", + " xeus::get_interpreter().display_data(mime_bundle_repr(t), nl::json::object(),\n", + " nl::json::object());\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e6c43ec-7aab-4488-a506-6ea0f720eb70", + "metadata": {}, + "outputs": [], + "source": [ + "// Abstract Solid\n", + "\n", + "class Solid {\n", + "public:\n", + " Vec e, c; // emission, color\n", + " Refl_t refl; // reflection type (DIFFuse, SPECular, REFRactive)\n", + "\n", + " Solid() : e(), c(), refl() {}\n", + " Solid(Vec e_, Vec c_, Refl_t refl_): e(e_), c(c_), refl(refl_) {}\n", + "\n", + " // returns distance, 0 if nohit\n", + " virtual double intersect(const Ray &r) const { return 0; };\n", + "\n", + " // returns normal vector to surface in point pt\n", + " virtual Vec normal(const Vec &pt) const { return Vec(1,0,0); };\n", + "};\n", + "\n", + "// Abstract Implicit Solid\n", + "\n", + "class ImplicitSolid : public Solid {\n", + "public:\n", + "\n", + " ImplicitSolid() {}\n", + " ImplicitSolid(Vec e_, Vec c_, Refl_t refl_): Solid(e_, c_, refl_) {}\n", + "\n", + " // Return signed distance to nearest point on solid surface\n", + " virtual double distance_func(double x, double y, double z) const {\n", + " return 0;\n", + " }\n", + "\n", + " // TODO: Remove when forward for virtual diff methods is not need\n", + " virtual double distance_func_darg0(double x, double y, double z) const;\n", + " virtual double distance_func_darg1(double x, double y, double z) const;\n", + " virtual double distance_func_darg2(double x, double y, double z) const;\n", + "\n", + " // implicit surface intersection\n", + " // returns distance, 0 if nohit\n", + " double intersect(const Ray &r) const override {\n", + " double t=2*eps, t1, f;\n", + " Vec pt;\n", + " do {\n", + " pt=r.o+r.d*t;\n", + " f=fabs(distance_func(pt.x, pt.y, pt.z));\n", + " t1=t;\n", + " t+=f;\n", + " if (f1 ? 1 : x;\n", + "}\n", + "\n", + "inline int toInt(double x) {\n", + " return int(pow(clamp(x),1/2.2)*255+.5);\n", + "}\n", + "\n", + "inline bool intersect(const Ray &ray, double &t, int &id) {\n", + " double d;\n", + "\n", + " t = inf;\n", + " for(int i=sizeof(scene)/sizeof(scene[0]); i--; ) {\n", + " if ((d = scene[i]->intersect(ray)) && df.y && f.x>f.z ? f.x : f.y>f.z ? f.y : f.z; // max refl\n", + "\n", + " // object intersection id map test\n", + " //return obj.c;\n", + "\n", + " cl = cl + cf.mult(obj.e);\n", + "\n", + " if (++depth>5 || !p) {\n", + " if (erand48(Xi).1 ? Vec(0,1) : Vec(1))%w).norm(), v=w%u;\n", + " Vec d = (u*cos(r1)*r2s+v*sin(r1)*r2s+w*sqrt(1-r2)).norm();\n", + "\n", + " r = Ray(x, d);\n", + " continue;\n", + " } else if (obj.refl == SPEC) { // Ideal SPECULAR reflection\n", + " //return obj.e + f.mult(radiance(Ray(x,r.d-n*2*(n*r.d)), depth, Xi));\n", + " r = Ray(x, r.d-n*2*(n*r.d));\n", + " continue;\n", + " }\n", + "\n", + " // Ideal dielectric REFRACTION\n", + " Ray reflRay(x, r.d-n*2*(n*r.d));\n", + " bool into = n*nl>0; // Ray from outside going in?\n", + " double nc=1, nt=1.5, nnt=into ? nc/nt : nt/nc, ddn=r.d*nl, cos2t;\n", + "\n", + " if ((cos2t=1-nnt*nnt*(1-ddn*ddn))<0) { // Total internal reflection\n", + " //return obj.e + f.mult(radiance(reflRay, depth, Xi));\n", + " r = reflRay;\n", + " continue;\n", + " }\n", + "\n", + " Vec tdir = (r.d*nnt-n*((into?1:-1)*(ddn*nnt+sqrt(cos2t)))).norm();\n", + " double a=nt-nc, b=nt+nc, R0=a*a/(b*b), c=1-(into?-ddn:tdir*n);\n", + " double Re=R0+(1-R0)*c*c*c*c*c, Tr=1-Re, P=.25+.5*Re, RP=Re/P, TP=Tr/(1-P);\n", + "\n", + " //return obj.e + f.mult(depth>2 ? // Russian roulette\n", + " // (erand48(Xi)

\n", + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "\n", + "namespace smallpt {\n", + "\n", + " constexpr double g_pi = 3.14159265358979323846;\n", + "\n", + " __host__ __device__ inline double Clamp(double v, \n", + " double low = 0.0, \n", + " double high = 1.0) noexcept\n", + " {\n", + " return fmin(fmax(v, low), high);\n", + " }\n", + "\n", + " inline std::uint8_t ToByte(double color, double gamma = 2.2) noexcept\n", + " {\n", + " const double gcolor = std::pow(color, 1.0 / gamma);\n", + " return static_cast< std::uint8_t >(Clamp(255.0 * gcolor, 0.0, 255.0));\n", + " }\n", + "\n", + " inline void HandleError(cudaError_t err, const char* file, int line) {\n", + " if (cudaSuccess != err) {\n", + " std::printf(\"%s in %s at line %d\\n\", \n", + " cudaGetErrorString(err), file, line);\n", + " std::exit(EXIT_FAILURE);\n", + " }\n", + " }\n", + "}\n", + "\n", + "#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))\n", + "\n", + "#define HANDLE_NULL(a) {if (a == NULL) { \\\n", + " std::printf(\"Host memory failed in %s at line %d\\n\", __FILE__, __LINE__); \\\n", + " std::exit(EXIT_FAILURE);}}" + ] + }, + { + "cell_type": "markdown", + "id": "9b44f208-5b50-45c5-a644-acf7d60552ff", + "metadata": { + "tags": [] + }, + "source": [ + "## Vector helpers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6d4988ce-575f-42ba-a96d-be0eabc113d3", + "metadata": {}, + "outputs": [], + "source": [ + "//#include \"math.hpp\"\n", + "\n", + "namespace smallpt {\n", + "\n", + " struct Vector3 {\n", + "\n", + " public:\n", + "\n", + " __host__ __device__ explicit Vector3(double xyz = 0.0) noexcept\n", + " : Vector3(xyz, xyz, xyz) {}\n", + " __host__ __device__ Vector3(double x, double y, double z) noexcept\n", + " : m_x(x), m_y(y), m_z(z) {}\n", + " Vector3(const Vector3& v) noexcept = default;\n", + " Vector3(Vector3&& v) noexcept = default;\n", + " ~Vector3() = default;\n", + "\n", + " Vector3& operator=(const Vector3& v) = default;\n", + " Vector3& operator=(Vector3&& v) = default;\n", + "\n", + " __device__ bool HasNaNs() const {\n", + " return std::isnan(m_x) || std::isnan(m_y) || std::isnan(m_z);\n", + " }\n", + "\n", + " __device__ const Vector3 operator-() const {\n", + " return { -m_x, -m_y, -m_z };\n", + " }\n", + "\n", + " __device__ const Vector3 operator+(const Vector3& v) const {\n", + " return { m_x + v.m_x, m_y + v.m_y, m_z + v.m_z };\n", + " }\n", + " __device__ const Vector3 operator-(const Vector3& v) const {\n", + " return { m_x - v.m_x, m_y - v.m_y, m_z - v.m_z };\n", + " }\n", + " __device__ const Vector3 operator*(const Vector3& v) const {\n", + " return { m_x * v.m_x, m_y * v.m_y, m_z * v.m_z };\n", + " }\n", + " __device__ const Vector3 operator/(const Vector3& v) const {\n", + " return { m_x / v.m_x, m_y / v.m_y, m_z / v.m_z };\n", + " }\n", + " __device__ const Vector3 operator+(double a) const {\n", + " return { m_x + a, m_y + a, m_z + a };\n", + " }\n", + " __device__ const Vector3 operator-(double a) const {\n", + " return { m_x - a, m_y - a, m_z - a };\n", + " }\n", + " __device__ const Vector3 operator*(double a) const {\n", + " return { m_x * a, m_y * a, m_z * a };\n", + " }\n", + " __device__ const Vector3 operator/(double a) const {\n", + " const double inv_a = 1.0 / a;\n", + " return { m_x * inv_a, m_y * inv_a, m_z * inv_a };\n", + " }\n", + "\n", + " __device__ Vector3& operator+=(const Vector3& v) {\n", + " m_x += v.m_x;\n", + " m_y += v.m_y;\n", + " m_z += v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator-=(const Vector3& v) {\n", + " m_x -= v.m_x;\n", + " m_y -= v.m_y;\n", + " m_z -= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator*=(const Vector3& v) {\n", + " m_x *= v.m_x;\n", + " m_y *= v.m_y;\n", + " m_z *= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator/=(const Vector3& v) {\n", + " m_x /= v.m_x;\n", + " m_y /= v.m_y;\n", + " m_z /= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator+=(double a) {\n", + " m_x += a;\n", + " m_y += a;\n", + " m_z += a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator-=(double a) {\n", + " m_x -= a;\n", + " m_y -= a;\n", + " m_z -= a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator*=(double a) {\n", + " m_x *= a;\n", + " m_y *= a;\n", + " m_z *= a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator/=(double a) {\n", + " const double inv_a = 1.0 / a;\n", + " m_x *= inv_a;\n", + " m_y *= inv_a;\n", + " m_z *= inv_a;\n", + " return *this;\n", + " }\n", + "\n", + " __device__ double Dot(const Vector3& v) const {\n", + " return m_x * v.m_x + m_y * v.m_y + m_z * v.m_z;\n", + " }\n", + " __device__ const Vector3 Cross(const Vector3& v) const {\n", + " return {\n", + " m_y * v.m_z - m_z * v.m_y,\n", + " m_z * v.m_x - m_x * v.m_z,\n", + " m_x * v.m_y - m_y * v.m_x\n", + " };\n", + " }\n", + "\n", + " __device__ bool operator==(const Vector3& rhs) const {\n", + " return m_x == rhs.m_x && m_y == rhs.m_y && m_z == rhs.m_z;\n", + " }\n", + " __device__ bool operator!=(const Vector3& rhs) const {\n", + " return !(*this == rhs);\n", + " }\n", + "\n", + " __device__ double& operator[](std::size_t i) {\n", + " return (&m_x)[i];\n", + " }\n", + " __device__ double operator[](std::size_t i) const {\n", + " return (&m_x)[i];\n", + " }\n", + "\n", + " __device__ std::size_t MinDimension() const {\n", + " return (m_x < m_y && m_x < m_z) ? 0u : ((m_y < m_z) ? 1u : 2u);\n", + " }\n", + " __device__ std::size_t MaxDimension() const {\n", + " return (m_x > m_y && m_x > m_z) ? 0u : ((m_y > m_z) ? 1u : 2u);\n", + " }\n", + " __device__ double Min() const {\n", + " return fmin(m_x, fmin(m_y, m_z));\n", + " }\n", + " __device__ double Max() const {\n", + " return fmax(m_x, fmax(m_y, m_z));\n", + " }\n", + "\n", + " __device__ double Norm2_squared() const {\n", + " return m_x * m_x + m_y * m_y + m_z * m_z;\n", + " }\n", + "\n", + " __device__ double Norm2() const {\n", + " return std::sqrt(Norm2_squared());\n", + " }\n", + "\n", + " __device__ void Normalize() {\n", + " const double a = 1.0 / Norm2();\n", + " m_x *= a;\n", + " m_y *= a;\n", + " m_z *= a;\n", + " }\n", + "\n", + " double m_x, m_y, m_z;\n", + " };\n", + "\n", + " __device__ inline const Vector3 operator+(double a, const Vector3& v) {\n", + " return { a + v.m_x, a + v.m_y, a + v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator-(double a, const Vector3& v) {\n", + " return { a - v.m_x, a - v.m_y, a - v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator*(double a, const Vector3& v) {\n", + " return { a * v.m_x, a * v.m_y, a * v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator/(double a, const Vector3& v) {\n", + " return { a / v.m_x, a / v.m_y, a / v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Sqrt(const Vector3& v) {\n", + " return {\n", + " std::sqrt(v.m_x),\n", + " std::sqrt(v.m_y),\n", + " std::sqrt(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Pow(const Vector3& v, double a) {\n", + " return {\n", + " std::pow(v.m_x, a),\n", + " std::pow(v.m_y, a),\n", + " std::pow(v.m_z, a)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Abs(const Vector3& v) {\n", + " return {\n", + " std::abs(v.m_x),\n", + " std::abs(v.m_y),\n", + " std::abs(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Min(const Vector3& v1, const Vector3& v2) {\n", + " return {\n", + " fmin(v1.m_x, v2.m_x),\n", + " fmin(v1.m_y, v2.m_y),\n", + " fmin(v1.m_z, v2.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Max(const Vector3& v1, const Vector3& v2) {\n", + " return {\n", + " fmax(v1.m_x, v2.m_x),\n", + " fmax(v1.m_y, v2.m_y),\n", + " fmax(v1.m_z, v2.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Round(const Vector3& v) {\n", + " return {\n", + " std::round(v.m_x),\n", + " std::round(v.m_y),\n", + " std::round(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Floor(const Vector3& v) {\n", + " return {\n", + " std::floor(v.m_x),\n", + " std::floor(v.m_y),\n", + " std::floor(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Ceil(const Vector3& v) {\n", + " return {\n", + " std::ceil(v.m_x),\n", + " std::ceil(v.m_y),\n", + " std::ceil(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Trunc(const Vector3& v) {\n", + " return {\n", + " std::trunc(v.m_x),\n", + " std::trunc(v.m_y),\n", + " std::trunc(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Clamp(const Vector3& v, \n", + " double low = 0.0, \n", + " double high = 1.0) {\n", + " return {\n", + " Clamp(v.m_x, low, high),\n", + " Clamp(v.m_y, low, high),\n", + " Clamp(v.m_z, low, high) }\n", + " ;\n", + " }\n", + "\n", + " __device__ inline const Vector3 Lerp(double a, \n", + " const Vector3& v1, \n", + " const Vector3& v2) {\n", + " return v1 + a * (v2 - v1);\n", + " }\n", + "\n", + " template< std::size_t X, std::size_t Y, std::size_t Z >\n", + " __device__ inline const Vector3 Permute(const Vector3& v) {\n", + " return { v[X], v[Y], v[Z] };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Normalize(const Vector3& v) {\n", + " const double a = 1.0 / v.Norm2();\n", + " return a * v;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "9b06a4dd-381d-491c-a963-60066108944c", + "metadata": { + "tags": [] + }, + "source": [ + "## Image IO helper" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b05a70cb-e401-4b1b-b5d3-9ee911ec4da9", + "metadata": {}, + "outputs": [], + "source": [ + "//#include \"vector.hpp\"\n", + "#include \n", + "\n", + "#ifdef __unix\n", + "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n", + "#endif\n", + "\n", + "namespace smallpt {\n", + "\n", + " inline void WritePPM(std::uint32_t w, \n", + " std::uint32_t h, \n", + " const Vector3* Ls, \n", + " const char* fname = \"cu-image.ppm\") noexcept {\n", + " \n", + " FILE* fp;\n", + " \n", + " fopen_s(&fp, fname, \"w\");\n", + " \n", + " std::fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n", + " for (std::size_t i = 0; i < w * h; ++i) {\n", + " std::fprintf(fp, \"%u %u %u \", \n", + " ToByte(Ls[i].m_x), \n", + " ToByte(Ls[i].m_y), \n", + " ToByte(Ls[i].m_z));\n", + " }\n", + " \n", + " std::fclose(fp);\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c1a91c9d-a269-431e-9a8b-3620e6ceafe1", + "metadata": {}, + "source": [ + "### SV PNG" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9cf31b05-61a6-486d-a584-792e45ee30d0", + "metadata": {}, + "outputs": [], + "source": [ + "/*\n", + "Copyright (C) 2017 Milo Yip. All rights reserved.\n", + "\n", + "Redistribution and use in source and binary forms, with or without\n", + "modification, are permitted provided that the following conditions are met:\n", + "\n", + "* Redistributions of source code must retain the above copyright notice, this\n", + " list of conditions and the following disclaimer.\n", + "\n", + "* Redistributions in binary form must reproduce the above copyright notice,\n", + " this list of conditions and the following disclaimer in the documentation\n", + " and/or other materials provided with the distribution.\n", + "\n", + "* Neither the name of pngout nor the names of its\n", + " contributors may be used to endorse or promote products derived from\n", + " this software without specific prior written permission.\n", + "\n", + "THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS IS\"\n", + "AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n", + "IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE\n", + "DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE\n", + "FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\n", + "DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR\n", + "SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\n", + "CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,\n", + "OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n", + "OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n", + "*/\n", + "\n", + "/*! \\file\n", + " \\brief svpng() is a minimalistic C function for saving RGB/RGBA image into uncompressed PNG.\n", + " \\author Milo Yip\n", + " \\version 0.1.1\n", + " \\copyright MIT license\n", + " \\sa http://github.com/miloyip/svpng\n", + "*/\n", + "\n", + "///#ifndef SVPNG_INC_\n", + "///#define SVPNG_INC_\n", + "\n", + "/*! \\def SVPNG_LINKAGE\n", + " \\brief User customizable linkage for svpng() function.\n", + " By default this macro is empty.\n", + " User may define this macro as static for static linkage, \n", + " and/or inline in C99/C++, etc.\n", + "*/\n", + "///#ifndef SVPNG_LINKAGE\n", + "#define SVPNG_LINKAGE\n", + "///#endif\n", + "\n", + "/*! \\def SVPNG_OUTPUT\n", + " \\brief User customizable output stream.\n", + " By default, it uses C file descriptor and fputc() to output bytes.\n", + " In C++, for example, user may use std::ostream or std::vector instead.\n", + "*/\n", + "///#ifndef SVPNG_OUTPUT\n", + "#include \n", + "#define SVPNG_OUTPUT FILE* fp\n", + "///#endif\n", + "\n", + "/*! \\def SVPNG_PUT\n", + " \\brief Write a byte\n", + "*/\n", + "///#ifndef SVPNG_PUT\n", + "#define SVPNG_PUT(u) fputc(u, fp)\n", + "///#endif\n", + "\n", + "\n", + "/*!\n", + " \\brief Save a RGB/RGBA image in PNG format.\n", + " \\param SVPNG_OUTPUT Output stream (by default using file descriptor).\n", + " \\param w Width of the image. (<16383)\n", + " \\param h Height of the image.\n", + " \\param img Image pixel data in 24-bit RGB or 32-bit RGBA format.\n", + " \\param alpha Whether the image contains alpha channel.\n", + "*/\n", + "SVPNG_LINKAGE void svpng(SVPNG_OUTPUT, unsigned w, unsigned h, const unsigned char* img, int alpha) {\n", + " static const unsigned t[] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c, \n", + " /* CRC32 Table */ 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };\n", + " unsigned a = 1, b = 0, c, p = w * (alpha ? 4 : 3) + 1, x, y, i; /* ADLER-a, ADLER-b, CRC, pitch */\n", + "#define SVPNG_U8A(ua, l) for (i = 0; i < l; i++) SVPNG_PUT((ua)[i]);\n", + "#define SVPNG_U32(u) do { SVPNG_PUT((u) >> 24); SVPNG_PUT(((u) >> 16) & 255); SVPNG_PUT(((u) >> 8) & 255); SVPNG_PUT((u) & 255); } while(0)\n", + "#define SVPNG_U8C(u) do { SVPNG_PUT(u); c ^= (u); c = (c >> 4) ^ t[c & 15]; c = (c >> 4) ^ t[c & 15]; } while(0)\n", + "#define SVPNG_U8AC(ua, l) for (i = 0; i < l; i++) SVPNG_U8C((ua)[i])\n", + "#define SVPNG_U16LC(u) do { SVPNG_U8C((u) & 255); SVPNG_U8C(((u) >> 8) & 255); } while(0)\n", + "#define SVPNG_U32C(u) do { SVPNG_U8C((u) >> 24); SVPNG_U8C(((u) >> 16) & 255); SVPNG_U8C(((u) >> 8) & 255); SVPNG_U8C((u) & 255); } while(0)\n", + "#define SVPNG_U8ADLER(u) do { SVPNG_U8C(u); a = (a + (u)) % 65521; b = (b + a) % 65521; } while(0)\n", + "#define SVPNG_BEGIN(s, l) do { SVPNG_U32(l); c = ~0U; SVPNG_U8AC(s, 4); } while(0)\n", + "#define SVPNG_END() SVPNG_U32(~c)\n", + " SVPNG_U8A(\"\\x89PNG\\r\\n\\32\\n\", 8); /* Magic */\n", + " SVPNG_BEGIN(\"IHDR\", 13); /* IHDR chunk { */\n", + " SVPNG_U32C(w); SVPNG_U32C(h); /* Width & Height (8 bytes) */\n", + " SVPNG_U8C(8); SVPNG_U8C(alpha ? 6 : 2); /* Depth=8, Color=True color with/without alpha (2 bytes) */\n", + " SVPNG_U8AC(\"\\0\\0\\0\", 3); /* Compression=Deflate, Filter=No, Interlace=No (3 bytes) */\n", + " SVPNG_END(); /* } */\n", + " SVPNG_BEGIN(\"IDAT\", 2 + h * (5 + p) + 4); /* IDAT chunk { */\n", + " SVPNG_U8AC(\"\\x78\\1\", 2); /* Deflate block begin (2 bytes) */\n", + " for (y = 0; y < h; y++) { /* Each horizontal line makes a block for simplicity */\n", + " SVPNG_U8C(y == h - 1); /* 1 for the last block, 0 for others (1 byte) */\n", + " SVPNG_U16LC(p); SVPNG_U16LC(~p); /* Size of block in little endian and its 1's complement (4 bytes) */\n", + " SVPNG_U8ADLER(0); /* No filter prefix (1 byte) */\n", + " for (x = 0; x < p - 1; x++, img++)\n", + " SVPNG_U8ADLER(*img); /* Image pixel data */\n", + " }\n", + " SVPNG_U32C((b << 16) | a); /* Deflate block end with adler (4 bytes) */\n", + " SVPNG_END(); /* } */\n", + " SVPNG_BEGIN(\"IEND\", 0); SVPNG_END(); /* IEND chunk {} */\n", + "}\n", + "\n", + "///#endif /* SVPNG_INC_ */" + ] + }, + { + "cell_type": "markdown", + "id": "29109b8c-ac5e-4826-aa48-6e167a48c059", + "metadata": {}, + "source": [ + "### Image helper" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "de44523b-d34f-4503-9796-1e124d20eea5", + "metadata": {}, + "outputs": [], + "source": [ + "int toInt(float x)\n", + "{\n", + " return (int) (pow(smallpt::Clamp(x, 0.0f, 1.0f), 1.0f / 2.2f) * 255 + 0.5f);\n", + "}\n", + "\n", + "void save(const char* fileName, int width, int height, smallpt::Vector3* data)\n", + "{\n", + " FILE *fp = fopen(fileName, \"wb\");\n", + "\n", + " // Convert from float3 array to uchar array\n", + " unsigned char* output = new unsigned char[width * height * 3];\n", + "\n", + " for(int i = 0; i < width * height; i++)\n", + " {\n", + " //printf_s(\"%f %f %f \\n\", data[i].x, data[i].y, data[i].z);\n", + " output[i * 3 + 0] = toInt(data[i].m_x);\n", + " output[i * 3 + 1] = toInt(data[i].m_y);\n", + " output[i * 3 + 2] = toInt(data[i].m_z);\n", + " }\n", + "\n", + " svpng(fp, width, height, output, 0);\n", + " fclose(fp);\n", + " delete[] output;\n", + "}\n", + "\n", + "//#include \"xeus/xjson.hpp\"\n", + "/*\n", + "inline std::string base64encode(const std::string& input)\n", + "{\n", + " std::string output;\n", + " int val = 0;\n", + " int valb = -6;\n", + " for (char sc : input)\n", + " {\n", + " unsigned char c = static_cast(sc);\n", + " val = (val << 8) + c;\n", + " valb += 8;\n", + " while (valb >= 0) {\n", + " output.push_back(\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\"[(val >> valb) & 0x3F]);\n", + " valb -= 6;\n", + " }\n", + " }\n", + " if (valb > -6) {\n", + " output.push_back(\"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/\"[((val << 8) >> (valb + 8)) & 0x3F]);\n", + " }\n", + " while (output.size() % 4) {\n", + " output.push_back('=');\n", + " }\n", + " return output;\n", + "}\n", + "*/\n", + "/*\n", + "// display image in the notebook\n", + "void display_image(std::vector &image, bool clear_ouput){\n", + " // memory objects for output in the web browser\n", + " std::stringstream buffer;\n", + " xeus::xjson mine;\n", + "\n", + " if(clear_ouput)\n", + " xeus::get_interpreter().clear_output(true);\n", + "\n", + " buffer.str(\"\");\n", + " for(auto c : image){\n", + " buffer << c;\n", + " }\n", + "\n", + " mine[\"image/png\"] = xtl::base64encode(buffer.str());\n", + " xeus::get_interpreter().display_data(\n", + " std::move(mine),\n", + " xeus::xjson::object(),\n", + " xeus::xjson::object());\n", + "}\n", + "*/" + ] + }, + { + "cell_type": "markdown", + "id": "caa7fb92-d64d-45b9-8d4f-9f9d81e75bfa", + "metadata": { + "tags": [] + }, + "source": [ + "# CUDA SmallPT main" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd12e5f-59ed-4367-8d22-65838179dc33", + "metadata": {}, + "outputs": [], + "source": [ + "//#include \"imageio.hpp\"\n", + "//#include \"sampling.cuh\"\n", + "//#include \"specular.cuh\"\n", + "//#include \"sphere.hpp\"\n", + "\n", + "//#include \"cuda_tools.hpp\"\n", + "#include \"curand_kernel.h\"\n", + "\n", + "#define REFRACTIVE_INDEX_OUT 1.0\n", + "#define REFRACTIVE_INDEX_IN 1.5\n", + "#define EPSILON_SPHERE 1e-4\n", + "\n", + "namespace smallpt {\n", + "\n", + " struct __align__(16)Ray {\n", + " __device__ explicit Ray(Vector3 o, \n", + " Vector3 d, \n", + " double tmin = 0.0, \n", + " double tmax = std::numeric_limits< double >::infinity(), \n", + " std::uint32_t depth = 0u) noexcept\n", + " : m_o(std::move(o)),\n", + " m_d(std::move(d)),\n", + " m_tmin(tmin),\n", + " m_tmax(tmax),\n", + " m_depth(depth) {};\n", + " Ray(const Ray& ray) noexcept = default;\n", + " Ray(Ray&& ray) noexcept = default;\n", + " ~Ray() = default;\n", + "\n", + " Ray& operator=(const Ray& ray) = default;\n", + " Ray& operator=(Ray&& ray) = default;\n", + "\n", + " __device__ const Vector3 operator()(double t) const {\n", + " return m_o + m_d * t;\n", + " }\n", + "\n", + " Vector3 m_o, m_d;\n", + " mutable double m_tmin, m_tmax;\n", + " std::uint32_t m_depth;\n", + " };\n", + "\n", + " enum struct Reflection_t {\n", + " Diffuse,\n", + " Specular,\n", + " Refractive\n", + " };\n", + "\n", + " struct __align__(16) Sphere {\n", + " __host__ __device__ explicit Sphere(double r,\n", + " Vector3 p,\n", + " Vector3 e,\n", + " Vector3 f,\n", + " Reflection_t reflection_t) noexcept\n", + " : m_r(r),\n", + " m_p(std::move(p)),\n", + " m_e(std::move(e)),\n", + " m_f(std::move(f)),\n", + " m_reflection_t(reflection_t) {}\n", + " Sphere(const Sphere& sphere) noexcept = default;\n", + " Sphere(Sphere&& sphere) noexcept = default;\n", + " ~Sphere() = default;\n", + "\n", + " Sphere& operator=(const Sphere& sphere) = default;\n", + " Sphere& operator=(Sphere&& sphere) = default;\n", + " \n", + " __device__ bool Intersect(const Ray& ray) const {\n", + " const Vector3 op = m_p - ray.m_o;\n", + " const double dop = ray.m_d.Dot(op);\n", + " const double D = dop * dop - op.Dot(op) + m_r * m_r;\n", + "\n", + " if (0.0 > D) {\n", + " return false;\n", + " }\n", + "\n", + " const double sqrtD = sqrt(D);\n", + "\n", + " const double tmin = dop - sqrtD;\n", + " if (ray.m_tmin < tmin && tmin < ray.m_tmax) {\n", + " ray.m_tmax = tmin;\n", + " return true;\n", + " }\n", + "\n", + " const double tmax = dop + sqrtD;\n", + " if (ray.m_tmin < tmax && tmax < ray.m_tmax) {\n", + " ray.m_tmax = tmax;\n", + " return true;\n", + " }\n", + "\n", + " return false;\n", + " }\n", + "\n", + " double m_r;\n", + " Vector3 m_p; // position\n", + " Vector3 m_e; // emission\n", + " Vector3 m_f; // reflection\n", + " Reflection_t m_reflection_t;\n", + " };\n", + " \n", + " const Sphere g_spheres[] = {\n", + " Sphere(1e5, Vector3(1e5 + 1, 40.8, 81.6), Vector3(), Vector3(0.75,0.25,0.25), Reflection_t::Diffuse), //Left\n", + " Sphere(1e5, Vector3(-1e5 + 99, 40.8, 81.6), Vector3(), Vector3(0.25,0.25,0.75), Reflection_t::Diffuse), //Right\n", + " Sphere(1e5, Vector3(50, 40.8, 1e5), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Back\n", + " Sphere(1e5, Vector3(50, 40.8, -1e5 + 170), Vector3(), Vector3(), Reflection_t::Diffuse), //Front\n", + " Sphere(1e5, Vector3(50, 1e5, 81.6), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Bottom\n", + " Sphere(1e5, Vector3(50, -1e5 + 81.6, 81.6), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Top\n", + " Sphere(16.5, Vector3(27, 16.5, 47), Vector3(), Vector3(0.999), Reflection_t::Specular), //Mirror\n", + " Sphere(16.5, Vector3(73, 16.5, 78), Vector3(), Vector3(0.999), Reflection_t::Refractive), //Glass\n", + " Sphere(600, Vector3(50, 681.6 - .27, 81.6), Vector3(12), Vector3(), Reflection_t::Diffuse) //Light\n", + " };\n", + "\n", + " __device__ inline Vector3 UniformSampleOnHemisphere(double u1, double u2) {\n", + " // u1 := cos_theta\n", + " const double sin_theta = std::sqrt(fmax(0.0, 1.0 - u1 * u1));\n", + " const double phi = 2.0 * g_pi * u2;\n", + " return {\n", + " std::cos(phi) * sin_theta,\n", + " std::sin(phi) * sin_theta,\n", + " u1\n", + " };\n", + " }\n", + "\n", + " __device__ inline Vector3 CosineWeightedSampleOnHemisphere(double u1, double u2) {\n", + " const double cos_theta = sqrt(1.0 - u1);\n", + " const double sin_theta = sqrt(u1);\n", + " const double phi = 2.0 * g_pi * u2;\n", + " return {\n", + " std::cos(phi) * sin_theta,\n", + " std::sin(phi) * sin_theta,\n", + " cos_theta\n", + " };\n", + " }\n", + " \n", + " __device__ inline double Reflectance0(double n1, double n2) {\n", + " const double sqrt_R0 = (n1 - n2) / (n1 + n2);\n", + " return sqrt_R0 * sqrt_R0;\n", + " }\n", + "\n", + " __device__ inline double SchlickReflectance(double n1, \n", + " double n2, \n", + " double c) {\n", + " const double R0 = Reflectance0(n1, n2);\n", + " return R0 + (1.0 - R0) * c * c * c * c * c;\n", + " }\n", + "\n", + " __device__ inline const Vector3 IdealSpecularReflect(const Vector3& d, const Vector3& n) {\n", + " return d - 2.0 * n.Dot(d) * n;\n", + " }\n", + "\n", + " __device__ inline const Vector3 IdealSpecularTransmit(const Vector3& d, \n", + " const Vector3& n, \n", + " double n_out, \n", + " double n_in, \n", + " double& pr, \n", + " curandState* state) {\n", + " \n", + " const Vector3 d_Re = IdealSpecularReflect(d, n);\n", + "\n", + " const bool out_to_in = (0.0 > n.Dot(d));\n", + " const Vector3 nl = out_to_in ? n : -n;\n", + " const double nn = out_to_in ? n_out / n_in : n_in / n_out;\n", + " const double cos_theta = d.Dot(nl);\n", + " const double cos2_phi = 1.0 - nn * nn * (1.0 - cos_theta * cos_theta);\n", + "\n", + " // Total Internal Reflection\n", + " if (0.0 > cos2_phi) {\n", + " pr = 1.0;\n", + " return d_Re;\n", + " }\n", + "\n", + " const Vector3 d_Tr = Normalize(nn * d - nl * (nn * cos_theta + sqrt(cos2_phi)));\n", + " const double c = 1.0 - (out_to_in ? -cos_theta : d_Tr.Dot(n));\n", + "\n", + " const double Re = SchlickReflectance(n_out, n_in, c);\n", + " const double p_Re = 0.25 + 0.5 * Re;\n", + " if (curand_uniform_double(state) < p_Re) {\n", + " pr = (Re / p_Re);\n", + " return d_Re;\n", + " }\n", + " else {\n", + " const double Tr = 1.0 - Re;\n", + " const double p_Tr = 1.0 - p_Re;\n", + " pr = (Tr / p_Tr);\n", + " return d_Tr;\n", + " }\n", + " }\n", + "\n", + " __device__ inline bool Intersect(const Sphere* dev_spheres, \n", + " std::size_t nb_spheres, \n", + " const Ray& ray, \n", + " size_t& id)\n", + " {\n", + " bool hit = false;\n", + " for (std::size_t i = 0u; i < nb_spheres; ++i) {\n", + " if (dev_spheres[i].Intersect(ray)) {\n", + " hit = true;\n", + " id = i;\n", + " }\n", + " }\n", + "\n", + " return hit;\n", + " }\n", + "\n", + " __device__ static Vector3 Radiance(const Sphere* dev_spheres, \n", + " std::size_t nb_spheres,\n", + " const Ray& ray, \n", + " curandState* state)\n", + " { \n", + " Ray r = ray;\n", + " Vector3 L;\n", + " Vector3 F(1.0);\n", + "\n", + " while (true) {\n", + " std::size_t id;\n", + " if (!Intersect(dev_spheres, nb_spheres, r, id)) {\n", + " return L;\n", + " }\n", + "\n", + " const Sphere& shape = dev_spheres[id];\n", + " const Vector3 p = r(r.m_tmax);\n", + " const Vector3 n = Normalize(p - shape.m_p);\n", + "\n", + " L += F * shape.m_e;\n", + " F *= shape.m_f;\n", + "\n", + " // Russian roulette\n", + " if (4 < r.m_depth) {\n", + " const double continue_probability = shape.m_f.Max();\n", + " if (curand_uniform_double(state) >= continue_probability) {\n", + " return L;\n", + " }\n", + " F /= continue_probability;\n", + " }\n", + "\n", + " // Next path segment\n", + " switch (shape.m_reflection_t) {\n", + " \n", + " case Reflection_t::Specular: {\n", + " const Vector3 d = IdealSpecularReflect(r.m_d, n);\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " break;\n", + " }\n", + " \n", + " case Reflection_t::Refractive: {\n", + " double pr;\n", + " const Vector3 d = IdealSpecularTransmit(r.m_d, n, REFRACTIVE_INDEX_OUT, REFRACTIVE_INDEX_IN, pr, state);\n", + " F *= pr;\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " break;\n", + " }\n", + " \n", + " default: {\n", + " const Vector3 w = (0.0 > n.Dot(r.m_d)) ? n : -n;\n", + " const Vector3 u = Normalize((abs(w.m_x) > 0.1 ? Vector3(0.0, 1.0, 0.0) : Vector3(1.0, 0.0, 0.0)).Cross(w));\n", + " const Vector3 v = w.Cross(u);\n", + "\n", + " const Vector3 sample_d = CosineWeightedSampleOnHemisphere(curand_uniform_double(state), curand_uniform_double(state));\n", + " const Vector3 d = Normalize(sample_d.m_x * u + sample_d.m_y * v + sample_d.m_z * w);\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " }\n", + " }\n", + " }\n", + " }\n", + "\n", + " __global__ static void kernel(const Sphere* dev_spheres, \n", + " std::size_t nb_spheres,\n", + " std::uint32_t w, \n", + " std::uint32_t h, \n", + " Vector3* Ls, \n", + " std::uint32_t nb_samples)\n", + " {\n", + " const std::uint32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const std::uint32_t y = threadIdx.y + blockIdx.y * blockDim.y;\n", + " const std::uint32_t offset = x + y * blockDim.x * gridDim.x;\n", + "\n", + " if (x >= w || y >= h) {\n", + " return;\n", + " }\n", + "\n", + " // RNG\n", + " curandState state;\n", + " curand_init(offset, 0u, 0u, &state);\n", + "\n", + " const Vector3 eye = { 50.0, 52.0, 295.6 };\n", + " const Vector3 gaze = Normalize(Vector3(0.0, -0.042612, -1.0));\n", + " const double fov = 0.5135;\n", + " const Vector3 cx = { w * fov / h, 0.0, 0.0 };\n", + " const Vector3 cy = Normalize(cx.Cross(gaze)) * fov;\n", + "\n", + " for (std::size_t sy = 0u, i = (h - 1u - y) * w + x; sy < 2u; ++sy) { // 2 subpixel row\n", + "\n", + " for (std::size_t sx = 0u; sx < 2u; ++sx) { // 2 subpixel column\n", + "\n", + " Vector3 L;\n", + "\n", + " for (std::size_t s = 0u; s < nb_samples; ++s) { // samples per subpixel\n", + " const double u1 = 2.0 * curand_uniform_double(&state);\n", + " const double u2 = 2.0 * curand_uniform_double(&state);\n", + " const double dx = (u1 < 1.0) ? sqrt(u1) - 1.0 : 1.0 - sqrt(2.0 - u1);\n", + " const double dy = (u2 < 1.0) ? sqrt(u2) - 1.0 : 1.0 - sqrt(2.0 - u2);\n", + " const Vector3 d = cx * (((sx + 0.5 + dx) * 0.5 + x) / w - 0.5) +\n", + " cy * (((sy + 0.5 + dy) * 0.5 + y) / h - 0.5) + gaze;\n", + " \n", + " L += Radiance(dev_spheres, nb_spheres, \n", + " Ray(eye + d * 130, Normalize(d), EPSILON_SPHERE), &state) \n", + " * (1.0 / nb_samples);\n", + " }\n", + " \n", + " Ls[i] += 0.25 * Clamp(L);\n", + " }\n", + " }\n", + " }\n", + "\n", + " static void Render(std::uint32_t nb_samples) noexcept\n", + " {\n", + " const std::uint32_t w = 256u; //1024u;\n", + " const std::uint32_t h = 192u; //768u;\n", + " const std::uint32_t nb_pixels = w * h;\n", + "\n", + " // Set up device memory\n", + " Sphere* dev_spheres;\n", + " HANDLE_ERROR(cudaMalloc((void**)&dev_spheres, sizeof(g_spheres)));\n", + " HANDLE_ERROR(cudaMemcpy(dev_spheres, g_spheres, sizeof(g_spheres), cudaMemcpyHostToDevice));\n", + " Vector3* dev_Ls;\n", + " HANDLE_ERROR(cudaMalloc((void**)&dev_Ls, nb_pixels * sizeof(Vector3)));\n", + " HANDLE_ERROR(cudaMemset(dev_Ls, 0, nb_pixels * sizeof(Vector3)));\n", + "\n", + " // Kernel execution\n", + " const dim3 nblocks(w / 16u, h / 16u);\n", + " const dim3 nthreads(16u, 16u);\n", + " kernel<<>>(dev_spheres, sizeof(g_spheres)/sizeof(g_spheres[0]), w, h, dev_Ls, nb_samples);\n", + "\n", + " // Set up host memory\n", + " Vector3* Ls = (Vector3*)malloc(nb_pixels * sizeof(Vector3));\n", + " // Transfer device -> host\n", + " HANDLE_ERROR(cudaMemcpy(Ls, dev_Ls, nb_pixels * sizeof(Vector3), cudaMemcpyDeviceToHost));\n", + "\n", + " // Clean up device memory\n", + " HANDLE_ERROR(cudaFree(dev_Ls));\n", + " HANDLE_ERROR(cudaFree(dev_spheres));\n", + "\n", + " //WritePPM(w, h, Ls);\n", + " save(\"cu-smallpt.png\", w, h, Ls);\n", + "\n", + " // Clean up host memory\n", + " free(Ls);\n", + " }\n", + "}\n", + "\n", + "void devicePropertyPrint()\n", + "{\n", + " int dev = 0;\n", + " cudaDeviceProp devProp;\n", + " if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n", + " {\n", + " printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n", + " printf(\"Device compute capability: %i - %i\\n\", devProp.minor, devProp.major);\n", + " printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n", + " printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n", + " printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n", + " printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n", + " printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n", + " printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n", + " }\n", + "}\n", + "\n", + "devicePropertyPrint();\n", + "const std::uint32_t nb_samples = 100;\n", + "smallpt::Render(nb_samples);" + ] + }, + { + "cell_type": "markdown", + "id": "a2e81c14-6630-4c0e-b8dd-8ee1f2ecfdd7", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec676c73-d492-494c-b089-85f7034cfe50", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "from IPython.display import Image" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d82489e-27ed-4324-a407-cc312bb84281", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "Image(filename='cu-smallpt.png')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae6231e8-105e-47f4-8eae-c6072b44a613", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \"nlohmann/json.hpp\"\n", + "#include \"xtl/xbase64.hpp\"\n", + "#include \"xeus/xinterpreter.hpp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "777222e4-6175-408c-9e1b-b1c110f89377", + "metadata": {}, + "outputs": [], + "source": [ + "void show_image() {\n", + " std::ifstream fin(\"cu-smallpt.png\", std::ios::binary); \n", + " std::stringstream m_buffer;\n", + " m_buffer << fin.rdbuf();\n", + " auto bundle = nlohmann::json::object();\n", + " bundle[\"image/png\"] = xtl::base64encode(m_buffer.str());\n", + " ////\n", + " //bundle[\"text/html\"] = \"

test

\";\n", + " // std::string(\"\";\n", + " ////\n", + " //printf(\"%s\\n\", bundle.dump().c_str());\n", + " xeus::get_interpreter().display_data(std::move(bundle), nlohmann::json::object(), nlohmann::json::object());\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7369364-526f-4a4c-81c5-d0249e03b6ef", + "metadata": {}, + "outputs": [], + "source": [ + "show_image();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14c154fe-9d18-4c4b-bc82-eeef5a310375", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CppInterOp (C++17)", + "language": "CUDA", + "name": "cppinterop-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/eigen_demo/cppyy_caas_tests.ipynb b/notebooks/eigen_demo/cppyy_caas_tests.ipynb new file mode 100644 index 00000000..5144b5e0 --- /dev/null +++ b/notebooks/eigen_demo/cppyy_caas_tests.ipynb @@ -0,0 +1,206 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "1170e2e2-0b77-4d70-afab-3069af8d0528", + "metadata": {}, + "outputs": [], + "source": [ + "#include\n", + "#include" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9af566fb-a31f-4d0b-ad7b-69383b2b0940", + "metadata": {}, + "outputs": [], + "source": [ + "namespace VoidPtrArray {\n", + " typedef struct _name {\n", + " _name() { p[0] = (void*)0x1; p[1] = (void*)0x2; p[2] = (void*)0x3; }\n", + " void* p[3];\n", + " } name;\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ce6a3448-50ee-483c-8c67-b3e3fd9f6c8a", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "n = cppyy.gbl.VoidPtrArray.name()\n", + "assert n.p[0] == 0x1\n", + "assert n.p[1] == 0x2\n", + "assert n.p[2] == 0x3" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "9dde9a47-48a7-4cae-90db-d489d61a1c51", + "metadata": {}, + "outputs": [], + "source": [ + "namespace VectorConstCharStar {\n", + " std::vector test = {\"hello\"};\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4c45ea0c-3634-43a3-bae9-c1972e6b4762", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import cppyy\n", + "ns = cppyy.gbl.VectorConstCharStar\n", + "\n", + "assert len(ns.test) == 1\n", + "assert ns.test[0] == \"hello\"\n", + "\n", + "ns.test.push_back(\"world\")\n", + "assert len(ns.test) == 2\n", + "assert ns.test[0] == \"hello\"\n", + "assert ns.test[1] == \"world\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d5699a3b-94ce-4b1a-96f5-2b3264e0cabb", + "metadata": {}, + "outputs": [], + "source": [ + "namespace ArrayLike {\n", + "struct __attribute__((__packed__)) Vector3f {\n", + " float x, y, z;\n", + "}; }\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8bff7898-1265-4341-941e-3d1810b06abd", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "N = 5\n", + "\n", + "v = cppyy.gbl.std.vector['ArrayLike::Vector3f'](N)\n", + "\n", + "for i in range(N):\n", + " d = v[i]\n", + " d.x, d.y, d.z = i, i*N, i*N**2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bbc3c5a0-7b0b-46ae-a576-7e5721b0883a", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "for i in range(N):\n", + " d = v[i]\n", + " assert d.x == float(i)\n", + " assert d.y == float(i*N)\n", + " assert d.z == float(i*N**2)\n" + ] + }, + { + "cell_type": "markdown", + "id": "15727205-0392-4fbd-a7e5-8cdb73f37ab1", + "metadata": {}, + "source": [ + "#### STL-like class with preinc by-ref returns" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "65ec9f7d-40a6-44c4-a944-6182ea7f8b02", + "metadata": {}, + "outputs": [], + "source": [ + "namespace PreIncrement {\n", + " struct Token {\n", + " int value;\n", + " };\n", + "\n", + "struct Iterator {\n", + " Token current;\n", + "\n", + " bool operator!=(const Iterator& rhs) {\n", + " return rhs.current.value != current.value; }\n", + " const Token& operator*() { return current; }\n", + " Iterator& operator++() {\n", + " current.value++;\n", + " return *this;\n", + " }\n", + "};\n", + "\n", + "struct Stream {\n", + " Iterator begin() { return Iterator(); }\n", + " Iterator end() { return Iterator{10}; }\n", + "}; }" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c4ed4321-6b9b-4531-8d79-ee00f7d46e3a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"\", line 5, in \n", + "TypeError: PreIncrement::Iterator PreIncrement::Stream::beginIterator PreIncrement::Stream::begin() =>\n", + " TypeError: unbound method Stream::begin must be called with a Stream instance as first argument\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "import cppyy\n", + "ns = cppyy.gbl.PreIncrement\n", + "\n", + "stream = ns.Stream\n", + "stream.begin(stream)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CUDA (C++17)", + "language": "CUDA", + "name": "cuda-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/eigen_demo/eigen_demo.ipynb b/notebooks/eigen_demo/eigen_demo.ipynb new file mode 100644 index 00000000..e36c9665 --- /dev/null +++ b/notebooks/eigen_demo/eigen_demo.ipynb @@ -0,0 +1,343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c9eda5c-bf2f-4bcd-8dee-bc7db08dd801", + "metadata": {}, + "source": [ + "### Here we include the **conda** installed library using Cppyy" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b172dcbe-7a72-4d1a-8e70-5955dec48735", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting package metadata (current_repodata.json): ...working... done\n", + "Solving environment: ...working... done\n", + "\n", + "\n", + "==> WARNING: A newer version of conda exists. <==\n", + " current version: 22.9.0\n", + " latest version: 23.7.4\n", + "\n", + "Please update conda by running\n", + "\n", + " $ conda update -n base -c conda-forge conda\n", + "\n", + "\n", + "\n", + "# All requested packages already installed.\n", + "\n", + "Retrieving notices: ...working... done\n" + ] + } + ], + "source": [ + "!conda install -y -c conda-forge eigen" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "01d6e223-c6bc-4255-bd2f-a2deb68ea06d", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "Cpp::AddIncludePath(\"/opt/conda/envs/.venv/include\");" + ] + }, + { + "cell_type": "markdown", + "id": "22e7b3ed-747e-4bd5-8b89-4a6e5661dfbf", + "metadata": {}, + "source": [ + "### We are immediately able to use the `Eigen::Dense` namespaces in our C++ code cell" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "815bd42b-024a-49c7-9557-fa5bfd8b4496", + "metadata": {}, + "outputs": [], + "source": [ + "#include \"/opt/conda/envs/.venv/include/eigen3/Eigen/Dense\"\n", + "\n", + "typedef Eigen::Matrix MatrixXd;\n", + "\n", + "Eigen::Vector2d a;\n", + "Eigen::Vector3d b;\n", + "Eigen::Vector4d c;" + ] + }, + { + "cell_type": "markdown", + "id": "0766e5c0-2d3b-4ae4-96bd-4a78f3b02cc2", + "metadata": {}, + "source": [ + "### We can define a templated class with a set of vector operations offered by Eigen " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bca71004-dec4-4571-8a89-c9c9a54aaeee", + "metadata": {}, + "outputs": [], + "source": [ + "template \n", + "class EigenOperations {\n", + "public:\n", + " static VectorType PerformOperations(const VectorType& v) {\n", + " VectorType result = v;\n", + " \n", + " // Using Eigen::Dense object methods\n", + " \n", + " result.normalize(); // eigen vector normalisation\n", + "\n", + " double dot = v.dot(v); // dot product\n", + " for (int i = 0; i < result.size(); i++) {\n", + " result[i] += dot;\n", + " }\n", + " \n", + " double squaredNorm = v.squaredNorm(); // vector squared norm \n", + " for (int i = 0; i < result.size(); i++) {\n", + " result[i] -= squaredNorm;\n", + " }\n", + "\n", + " return result;\n", + " }\n", + "};\n" + ] + }, + { + "cell_type": "markdown", + "id": "de00d39d-8156-4ecb-a932-924661e7b80e", + "metadata": {}, + "source": [ + "### Accessing the declared Eigen vectors on the Python side using Cppyy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9b7e7fee-1edd-427c-a738-0cdf941200c1", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import cppyy\n", + "\n", + "a = cppyy.gbl.a\n", + "b = cppyy.gbl.b\n", + "c = cppyy.gbl.c\n", + "\n", + "def display_eigen(vec, name):\n", + " print(\"Vector : \", name, \", Dimensions : \", len(vec))\n", + " for x in vec:\n", + " print(x)" + ] + }, + { + "cell_type": "markdown", + "id": "07cfc277-4202-47b6-9859-dd4a651bd4a1", + "metadata": {}, + "source": [ + "#### Setting values to the C++ Eigen Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3775e315-652d-4dea-8729-e36549524b62", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "for i in range(len(a)):\n", + " a[i] = i - 1\n", + "\n", + "for i in range(len(b)):\n", + " b[i] = i + 1\n", + "\n", + "for i in range(len(c)):\n", + " c[i] = i - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ca77d6ff-3428-4977-9072-9f2f2a904973", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vector : A , Dimensions : 2\n", + "-1.0\n", + "0.0\n", + "Vector : B , Dimensions : 3\n", + "1.0\n", + "2.0\n", + "3.0\n", + "Vector : C , Dimensions : 4\n", + "-1.0\n", + "0.0\n", + "1.0\n", + "2.0\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "display_eigen(a, \"A\")\n", + "display_eigen(b, \"B\")\n", + "display_eigen(c, \"C\")" + ] + }, + { + "cell_type": "markdown", + "id": "e8c29a51-9897-4e16-9f6e-bacf918129cb", + "metadata": {}, + "source": [ + "### Calling the Templated method of the C++ class from the Python side" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a01b4c6d-88ea-4cb1-a17e-716e1d813b1b", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "res2d = cppyy.gbl.EigenOperations[\"Eigen::Vector2d\"].PerformOperations(a)\n", + "res3d = cppyy.gbl.EigenOperations[\"Eigen::Vector3d\"].PerformOperations(b)\n", + "res4d = cppyy.gbl.EigenOperations[\"Eigen::Vector4d\"].PerformOperations(c)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "680cc0d4-3d7b-441c-8f71-3acdf1b9b3e2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "876beb2f-113e-4889-bd10-3e460d17181e", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "\n", + "origin = [0, 0]\n", + "\n", + "np_res2 = np.array(list(res2d))\n", + "np_res3 = np.array(list(res3d)) \n", + "np_res4 = np.array(list(res4d)) \n", + "\n", + "plt.quiver(*origin, *np_res2, color=['r'], scale=5)\n", + "plt.quiver(*origin, *np_res3, color=['b'], scale=5)\n", + "\n", + "\n", + "plt.savefig(\"test.jpg\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fc77ec01-bd62-4a87-b2e1-76141623fc5c", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4078ec69-0b4d-4350-8e6b-2d793bc27491", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vector : A , Dimensions : 2\n", + "-1.0\n", + "0.0\n", + "Vector : B , Dimensions : 3\n", + "0.26726124191242384\n", + "0.5345224838248495\n", + "0.8017837257372733\n", + "Vector : C , Dimensions : 4\n", + "-0.40824829046386313\n", + "0.0\n", + "0.40824829046386313\n", + "0.8164965809277263\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "display_eigen(res2d, \"A\")\n", + "display_eigen(res3d, \"B\")\n", + "display_eigen(res4d, \"C\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++17", + "language": "C++17", + "name": "xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/eigen_demo/test.jpg b/notebooks/eigen_demo/test.jpg new file mode 100644 index 00000000..ff7a9422 Binary files /dev/null and b/notebooks/eigen_demo/test.jpg differ diff --git a/notebooks/image_CUDA_demo/img_in.jpg b/notebooks/image_CUDA_demo/img_in.jpg new file mode 100644 index 00000000..08fc8254 Binary files /dev/null and b/notebooks/image_CUDA_demo/img_in.jpg differ diff --git a/notebooks/image_CUDA_demo/img_out.jpg b/notebooks/image_CUDA_demo/img_out.jpg new file mode 100644 index 00000000..10ed414f Binary files /dev/null and b/notebooks/image_CUDA_demo/img_out.jpg differ diff --git a/notebooks/image_CUDA_demo/threshold_test.ipynb b/notebooks/image_CUDA_demo/threshold_test.ipynb new file mode 100644 index 00000000..9e59283c --- /dev/null +++ b/notebooks/image_CUDA_demo/threshold_test.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9ba0128f-ef46-4b86-a4cd-02fba18d6516", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c63a0cf7-c79b-4281-8b76-e0ccad6c7b93", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void thresholdKernel(float* input, float* output, const int width, const int height) {\n", + " const unsigned int col = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const unsigned int row = threadIdx.y + blockIdx.y * blockDim.y;\n", + " if (row < height && col < width) {\n", + " \n", + " if(input[col + row * width] > 200)\n", + " output[col + row * width] = input[col + row * width] * 2;\n", + " else\n", + " output[col + row * width] = input[col + row * width] * 0.4;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c7454511-0b12-4022-ab97-c73f50c3c1f8", + "metadata": {}, + "outputs": [], + "source": [ + "const int width = 512; \n", + "const int height = 512;" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c639e359-7481-4fe0-8dfd-860885fd9044", + "metadata": {}, + "outputs": [], + "source": [ + "float* h_input = new float[width * height];\n", + "float* h_output = new float[width * height];" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bf055a82-9dd8-4961-a271-b3e5a144c909", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: Pillow in /opt/conda/envs/.venv/lib/python3.10/site-packages (10.0.1)\n" + ] + } + ], + "source": [ + "!pip install Pillow" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d7c9c2b7-45be-4d22-90f9-4d61042e353f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy in /opt/conda/envs/.venv/lib/python3.10/site-packages (1.26.0)\n" + ] + } + ], + "source": [ + "!pip install numpy" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ac00db7b-867a-4ff0-8afa-2c13006b9f32", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install matplotlib" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a872a8cd-117b-4238-a4ea-e31e9dac48f3", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "from PIL import Image\n", + "import numpy as np\n", + "\n", + "\n", + "image = Image.open('img_in.jpg') \n", + "image = image.resize((512, 512))\n", + "\n", + "image_array = np.array(image)\n", + "if len(image_array.shape) == 3 and image_array.shape[2] == 3:\n", + " image_array = np.array(image.convert('L'))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6a1cd914-db44-4e34-a9fb-5d2100b298b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[126 126 125 ... 84 84 84]\n", + " [126 126 125 ... 84 84 84]\n", + " [127 126 126 ... 84 84 84]\n", + " ...\n", + " [ 58 55 63 ... 19 15 8]\n", + " [ 57 56 53 ... 23 17 16]\n", + " [ 55 50 58 ... 5 16 16]]\n" + ] + } + ], + "source": [ + "%%python\n", + "\n", + "print(image_array)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7545b5c9-c901-4599-abde-3ae5ac17e160", + "metadata": {}, + "outputs": [], + "source": [ + "void displayImgArray(float* input) {\n", + " for (int i = 0; i < 3; i++) {\n", + " std::cout << input[i] << \" \"; \n", + " }\n", + "\n", + " std::cout << \" ... \";\n", + "\n", + " for (int i = width * height - 3; i < width * height; i++) {\n", + " std::cout << input[i] << \" \"; \n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a3c74a47-399e-4454-89f6-f4e8b0376a68", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import cppyy\n", + "\n", + "img_list = image_array.flatten().tolist()\n", + "img_vector = cppyy.gbl.std.vector['float'](img_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ab6c16bd-c44a-4e4e-985a-12c5ca3256a8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "void setImg(const std::vector& input) {\n", + " if (h_input != nullptr) {\n", + " delete[] h_input;\n", + " }\n", + "\n", + " h_input = new float[input.size()];\n", + "\n", + " for (size_t i = 0; i < input.size(); i++) {\n", + " h_input[i] = input[i]; // No casting needed\n", + " }\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a168bbab-fc67-41f5-85c1-f9980d75a00b", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector getOutput() {\n", + " std::vector res;\n", + " for (size_t i = 0; i < width * height; i++) {\n", + " res.push_back(h_output[i]);\n", + " }\n", + " return res;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2eec8b62-5500-45e4-9c02-9bc587288206", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "cppyy.gbl.setImg(img_vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fd1bef06-d14e-4c82-8fa5-da14df696fe5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "126 126 125 ... 5 16 16 " + ] + } + ], + "source": [ + "displayImgArray(h_input);" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ea90464f-ed8f-44f1-9336-99aa3dc85149", + "metadata": {}, + "outputs": [], + "source": [ + "float* d_input;\n", + "float* d_output;\n", + "\n", + "cudaMalloc((void**)&d_input, width * height * sizeof(float));\n", + "cudaMalloc((void**)&d_output, width * height * sizeof(float));\n", + "\n", + "cudaMemcpy(d_input, h_input, width * height * sizeof(float), cudaMemcpyHostToDevice);\n", + "\n", + "dim3 dimBlock(16, 16);\n", + "dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);\n", + "\n", + "thresholdKernel<<>>(d_input, d_output, width, height);\n", + "\n", + "cudaMemcpy(h_output, d_output, width * height * sizeof(float), cudaMemcpyDeviceToHost);\n", + "\n", + "cudaFree(d_input);\n", + "cudaFree(d_output)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "7bf09ac1-ac6b-45e9-93c2-d4ae50897c2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "50.4 50.4 50 ... 2 6.4 6.4 " + ] + } + ], + "source": [ + "displayImgArray(h_output);" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3a97a53f-c2b7-4e24-a299-a6c0ec81b16e", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector blurredRes = getOutput();" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "2586f437-295d-4b33-b18e-6f28628a7f81", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "k = np.array(cppyy.gbl.blurredRes, dtype = np.uint8)\n", + "k.resize(512, 512)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a8104074-7398-4653-a81a-827dd81a16b3", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "Image.fromarray(k).save(\"img_out.jpg\")" + ] + }, + { + "cell_type": "markdown", + "id": "d0d2e15b-5a5f-4ed8-8f17-3fd1f3fc4daf", + "metadata": {}, + "source": [ + "\n", + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CUDA (C++17)", + "language": "CUDA", + "name": "cuda-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/images/xeus-cpp.png b/notebooks/images/xeus-cpp.png new file mode 100644 index 00000000..c5f9c11b Binary files /dev/null and b/notebooks/images/xeus-cpp.png differ diff --git a/notebooks/index.ipynb b/notebooks/index.ipynb new file mode 100644 index 00000000..5222ed54 --- /dev/null +++ b/notebooks/index.ipynb @@ -0,0 +1,721 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![xeus-cpp](images/xeus-cpp.png)](https://github.com/compiler-research/xeus-cpp)\n", + "\n", + "A Jupyter kernel for C++ based on the `clang-repl` C++ interpreter and the `xeus` native implementation of the Jupyter protocol, xeus.\n", + "\n", + "- GitHub repository: https://github.com/compiler-research/xeus-cpp\n", + "- Online documentation: https://xeus-cpp.readthedocs.io/" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Usage\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + " To run the selected code cell, hit
Shift + Enter
\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Output and error streams\n", + "\n", + "`std::cout` and `std::cerr` are redirected to the notebook frontend." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "some output\n" + ] + } + ], + "source": [ + "#include \n", + "\n", + "std::cout << \"some output\" << std::endl;" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "some error\n" + ] + } + ], + "source": [ + "std::cerr << \"some error\" << std::endl;" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "ename": "Standard Exception", + "evalue": "Unknown exception", + "output_type": "error", + "traceback": [ + "Standard Exception Unknown exception" + ] + } + ], + "source": [ + "throw std::runtime_error(\"Unknown exception\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Omitting the `;` in the last statement of a cell results in an output being printed" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "int j = 5;" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//j" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "# Interpreting the C++ programming language\n", + "\n", + "`clang-repl` has a broad support of the features of C++. You can define functions, classes, templates, etc ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Functions" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "double sqr(double a)\n", + "{\n", + " return a * a;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "double a = 2.5;\n", + "double asqr = sqr(a);\n", + "//// Fix:\n", + "//asqr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "class Foo\n", + "{\n", + "public:\n", + "\n", + " virtual ~Foo() {}\n", + " \n", + " virtual void print(double value) const\n", + " {\n", + " std::cout << \"Foo value = \" << value << std::endl;\n", + " }\n", + "};" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Foo value = 1.2\n" + ] + } + ], + "source": [ + "Foo bar;\n", + "bar.print(1.2);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Polymorphism" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "class Bar : public Foo\n", + "{\n", + "public:\n", + "\n", + " virtual ~Bar() {}\n", + " \n", + " virtual void print(double value) const\n", + " {\n", + " std::cout << \"Bar value = \" << 2 * value << std::endl;\n", + " }\n", + "};" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bar value = 2.4\n" + ] + } + ], + "source": [ + "Foo* bar2 = new Bar;\n", + "bar2->print(1.2);\n", + "delete bar2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Templates" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "\n", + "template \n", + "class FooT\n", + "{\n", + "public:\n", + " \n", + " explicit FooT(const T& t) : m_t(t) {}\n", + " \n", + " void print() const\n", + " {\n", + " std::cout << typeid(T).name() << \" m_t = \" << m_t << std::endl;\n", + " }\n", + " \n", + "private:\n", + " \n", + " T m_t;\n", + "};" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "d m_t = 1.2\n" + ] + } + ], + "source": [ + "FooT foot(1.2);\n", + "foot.print();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## C++11 / C++14 support" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "class Foo11\n", + "{\n", + "public:\n", + " \n", + " Foo11() { std::cout << \"Foo11 default constructor\" << std::endl; }\n", + " Foo11(const Foo11&) { std::cout << \"Foo11 copy constructor\" << std::endl; }\n", + " Foo11(Foo11&&) { std::cout << \"Foo11 move constructor\" << std::endl; }\n", + "};" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Foo11 default constructor\n", + "Foo11 copy constructor\n", + "Foo11 move constructor\n" + ] + } + ], + "source": [ + "Foo11 f1;\n", + "Foo11 f2(f1);\n", + "Foo11 f3(std::move(f1));" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "\n", + "std::vector v = { 1, 2, 3 };\n", + "auto iter = ++v.begin();\n", + "//// Fix:\n", + "//v" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//*iter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "... and also lambda, universal references, `decltype`, etc ..." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Documentation and completion\n", + "\n", + " - Documentation for types of the standard library is retrieved on cppreference.com.\n", + " - The quick-help feature can also be enabled for user-defined types and third-party libraries. More documentation on this feature is available at https://xeus-cpp.readthedocs.io/en/latest/inline_help.html.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//?std::vector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Using the `display_data` mechanism" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For a user-defined type `T`, the rich rendering in the notebook and JupyterLab can be enabled by by implementing the function `nl::json mime_bundle_repr(const T& im)`, which returns the JSON mime bundle for that type.\n", + "\n", + "More documentation on the rich display system of Jupyter and Xeus-cpp is available at https://xeus-cpp.readthedocs.io/en/latest/rich_display.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Image example" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "\n", + "#include \"nlohmann/json.hpp\"\n", + "\n", + "#include \"xtl/xbase64.hpp\"\n", + "\n", + "namespace nl = nlohmann;\n", + "\n", + "namespace im\n", + "{\n", + " struct image\n", + " { \n", + " inline image(const std::string& filename)\n", + " {\n", + " std::ifstream fin(filename, std::ios::binary); \n", + " m_buffer << fin.rdbuf();\n", + " }\n", + " \n", + " std::stringstream m_buffer;\n", + " };\n", + " \n", + " nl::json mime_bundle_repr(const image& i)\n", + " {\n", + " auto bundle = nl::json::object();\n", + " bundle[\"image/png\"] = xtl::base64encode(i.m_buffer.str());\n", + " return bundle;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "im::image marie(\"images/marie.png\");\n", + "//// Fix:\n", + "//marie" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Audio example" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "\n", + "#include \"nlohmann/json.hpp\"\n", + "\n", + "#include \"xtl/xbase64.hpp\"\n", + "\n", + "namespace nl = nlohmann;\n", + "\n", + "namespace au\n", + "{\n", + " struct audio\n", + " { \n", + " inline audio(const std::string& filename)\n", + " {\n", + " std::ifstream fin(filename, std::ios::binary); \n", + " m_buffer << fin.rdbuf();\n", + " }\n", + " \n", + " std::stringstream m_buffer;\n", + " };\n", + " \n", + " nl::json mime_bundle_repr(const audio& a)\n", + " {\n", + " auto bundle = nl::json::object();\n", + " bundle[\"text/html\"] =\n", + " std::string(\"\";\n", + " return bundle;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "au::audio drums(\"audio/audio.wav\");\n", + "//// Fix:\n", + "//drums" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Display" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//#include \"xcpp/xdisplay.hpp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//xcpp::display(drums);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Update-display" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "//// Fix:\n", + "//#include \"xcpp/xdisplay.hpp\"\n", + "\n", + "#include \"nlohmann/json.hpp\"\n", + "\n", + "namespace nl = nlohmann;\n", + "\n", + "namespace ht\n", + "{\n", + " struct html\n", + " { \n", + " inline html(const std::string& content)\n", + " {\n", + " m_content = content;\n", + " }\n", + " std::string m_content;\n", + " };\n", + "\n", + " nl::json mime_bundle_repr(const html& a)\n", + " {\n", + " auto bundle = nl::json::object();\n", + " bundle[\"text/html\"] = a.m_content;\n", + " return bundle;\n", + " }\n", + "}\n", + "\n", + "// A blue rectangle\n", + "//// Fix:\n", + "/*ht::html rect(R\"(\n", + "
\n", + "Original\n", + "
)\");*/" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "//xcpp::display(rect, \"some_display_id\");" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "// Update the rectangle to be red\n", + "//// Fix:\n", + "/*rect.m_content = R\"(\n", + "
\n", + "Updated\n", + "
)\";*/\n", + "\n", + "//xcpp::display(rect, \"some_display_id\", true);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clear output" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "\n", + "//// Fix:\n", + "//#include \"xcpp/xdisplay.hpp\"" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "/*std::cout << \"hello\" << std::endl;\n", + "std::this_thread::sleep_for(std::chrono::seconds(1));\n", + "xcpp::clear_output(); // will flicker when replacing \"hello\" with \"goodbye\"\n", + "std::this_thread::sleep_for(std::chrono::seconds(1));\n", + "std::cout << \"goodbye\" << std::endl;*/" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "//// Fix:\n", + "/*std::cout << \"hello\" << std::endl;\n", + "std::this_thread::sleep_for(std::chrono::seconds(1));\n", + "xcpp::clear_output(true); // prevents flickering\n", + "std::this_thread::sleep_for(std::chrono::seconds(1));\n", + "std::cout << \"goodbye\" << std::endl;*/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++14", + "language": "C++14", + "name": "xcpp14" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "14" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/kalman_CUDA_demo/1D_KF_plot.png b/notebooks/kalman_CUDA_demo/1D_KF_plot.png new file mode 100644 index 00000000..843b4035 Binary files /dev/null and b/notebooks/kalman_CUDA_demo/1D_KF_plot.png differ diff --git a/notebooks/kalman_CUDA_demo/assets/kf_diagram.png b/notebooks/kalman_CUDA_demo/assets/kf_diagram.png new file mode 100644 index 00000000..a2b06567 Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/kf_diagram.png differ diff --git a/notebooks/kalman_CUDA_demo/assets/motion_illustration.png b/notebooks/kalman_CUDA_demo/assets/motion_illustration.png new file mode 100644 index 00000000..be8de724 Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/motion_illustration.png differ diff --git a/notebooks/kalman_CUDA_demo/assets/x_graph.png b/notebooks/kalman_CUDA_demo/assets/x_graph.png new file mode 100644 index 00000000..a50f9fe7 Binary files /dev/null and b/notebooks/kalman_CUDA_demo/assets/x_graph.png differ diff --git a/notebooks/kalman_CUDA_demo/data/measurements.yml b/notebooks/kalman_CUDA_demo/data/measurements.yml new file mode 100644 index 00000000..9e4e5ff6 --- /dev/null +++ b/notebooks/kalman_CUDA_demo/data/measurements.yml @@ -0,0 +1,146 @@ +data : +- 1.04202710058 +- 1.10726790452 +- 1.2913511148 +- 1.48485250951 +- 1.72825901034 +- 1.74216489744 +- 2.11672039768 +- 2.14529225112 +- 2.16029641405 +- 2.21269371128 +- 2.57709350237 +- 2.6682215744 +- 2.51641839428 +- 2.76034056782 +- 2.88131780617 +- 2.88373786518 +- 2.9448468727 +- 2.82866600131 +- 3.0006601946 +- 3.12920591669 +- 2.858361783 +- 2.83808170354 +- 2.68975330958 +- 2.66533185589 +- 2.81613499531 +- 2.81003612051 +- 2.88321849354 +- 2.69789264832 +- 2.4342229249 +- 2.23464791825 +- 2.30278776224 +- 2.02069770395 +- 1.94393985809 +- 1.82498398739 +- 1.52526230354 +- 1.86967808173 +- 1.18073207847 +- 1.10729605087 +- 0.916168349913 +- 0.678547664519 +- 0.562381751596 +- 0.355468474885 +- -0.155607486619 +- -0.287198661013 +- -0.602973173813 +- -0.292648661013 +- -0.602973173813 +- -0.924197686613 +- -1.2563221994129998 +- -1.5993467122129998 +- -1.9532712250129998 +- -2.3180957378129996 +- -2.6938202506129993 +- -3.0804447634129994 +- -3.4779692762129994 +- -3.8863937890129994 +- -4.305718301812999 +- -4.735942814612999 +- -5.177067327412999 +- -5.629091840212999 +- -6.0920163530129985 +- -6.565840865812999 +- -7.0505653786129985 +- -7.546189891412999 +- -8.052714404212999 +- -8.570138917012999 +- -9.098463429812998 +- -9.637687942612999 +- -10.187812455412999 +- -10.748836968212998 +- -11.320761481013 +- -11.903585993813 +- -12.497310506613 +- -13.101935019413 +- -13.717459532213 +- -14.343884045013 +- -14.981208557813002 +- -15.629433070613002 +- -16.288557583413002 +- -16.958582096213004 +- -17.639506609013004 +- -18.331331121813005 +- -19.034055634613004 +- -19.747680147413003 +- -20.472204660213006 +- -21.207629173013007 +- -21.95395368581301 +- -22.71117819861301 +- -23.47930271141301 +- -24.25832722421301 +- -25.04825173701301 +- -25.849076249813013 +- -26.660800762613015 +- -27.483425275413015 +- -28.316949788213016 +- -29.161374301013016 +- -30.01669881381302 +- -30.88292332661302 +- -31.760047839413023 +- -32.64807235221303 +- -33.54699686501303 +- -34.45682137781303 +- -35.37754589061303 +- -36.30917040341303 +- -37.251694916213026 +- -38.20511942901303 +- -39.169443941813036 +- -40.14466845461304 +- -41.13079296741304 +- -42.12781748021305 +- -43.13574199301305 +- -44.15456650581305 +- -45.18429101861305 +- -46.22491553141305 +- -47.276440044213054 +- -48.33886455701305 +- -49.41218906981305 +- -50.49641358261306 +- -51.591538095413064 +- -52.69756260821307 +- -53.81448712101307 +- -54.94231163381308 +- -56.08103614661308 +- -57.23066065941308 +- -58.391185172213085 +- -59.562609685013086 +- -60.74493419781309 +- -61.93815871061309 +- -63.14228322341309 +- -64.35730773621309 +- -65.5832322490131 +- -66.82005676181309 +- -68.0677812746131 +- -69.3264057874131 +- -70.5959303002131 +- -71.8763548130131 +- -73.1676793258131 +- -74.46990383861309 +- -75.7830283514131 +- -77.1070528642131 +- -78.4419773770131 +- -79.78780188981311 +- -81.14452640261311 +- -82.51215091541312 +- -83.89067542821311 \ No newline at end of file diff --git a/notebooks/kalman_CUDA_demo/kalman_gain_plot.png b/notebooks/kalman_CUDA_demo/kalman_gain_plot.png new file mode 100644 index 00000000..f0db97cf Binary files /dev/null and b/notebooks/kalman_CUDA_demo/kalman_gain_plot.png differ diff --git a/notebooks/kalman_CUDA_demo/position_graph.png b/notebooks/kalman_CUDA_demo/position_graph.png new file mode 100644 index 00000000..bf77cc6c Binary files /dev/null and b/notebooks/kalman_CUDA_demo/position_graph.png differ diff --git a/notebooks/kalman_CUDA_demo/run_kf.ipynb b/notebooks/kalman_CUDA_demo/run_kf.ipynb new file mode 100644 index 00000000..67db75e4 --- /dev/null +++ b/notebooks/kalman_CUDA_demo/run_kf.ipynb @@ -0,0 +1,1503 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0aa31d74-5546-4990-9dd4-c44b552443a3", + "metadata": {}, + "source": [ + "# Running a 1 Dimensional Kalman Filter using std::vector and CUDA" + ] + }, + { + "cell_type": "markdown", + "id": "8e77cdc1-2b81-4fe1-b815-86d7e05f9dab", + "metadata": {}, + "source": [ + "## System model:" + ] + }, + { + "cell_type": "markdown", + "id": "d0461c44-8d17-4e89-86ff-e9216ee51513", + "metadata": {}, + "source": [ + "| 1D LTI Projectile Motion | Position Graph with time|\n", + "| ---------------------- | ---------------------- |\n", + "| ![motion](assets/motion_illustration.png) | ![dog](assets/x_graph.png) |\n" + ] + }, + { + "cell_type": "markdown", + "id": "adc83b07-aa17-4bd7-8030-971ff8e16a5f", + "metadata": {}, + "source": [ + "## Goal : Estimate the value of Gravitational Acceleration *(g)* based on position measurements of a 1D LTI projectile\n", + "\n", + "Kalman Filter operates by maintaining an ongoing estimate of the state of a system and updating this estimate based on new measurements and a prediction model. \n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9f1f08b0-9d14-46c0-af75-4ea72af2d350", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "markdown", + "id": "c0fbb643-fde6-4623-afd5-ee4069b025c8", + "metadata": {}, + "source": [ + "### Set up the `KalmanFilter` class and constructor\n", + "\n", + " Here we create a Kalman filter with the specified matrices:\n", + " \n", + " - A - System dynamics matrix, here we model 1D motion with a *(Position x, Velocity v, Acceleration a)* model\n", + " - C - Output matrix\n", + " - Q - Covariance matrix of the process noise random variable *p(w) ~ N(0, Q)*\n", + " - R - Covariance matrix of the measurement noise random variable *p(v) ~ N(0, R)*\n", + " - P - Estimate error covariance\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e1400114-f948-43b6-a0ba-2cdb566014f8", + "metadata": {}, + "outputs": [], + "source": [ + "class KalmanFilter {\n", + "\n", + "public:\n", + "\n", + " /**\n", + " * Create a Kalman filter with the specified matrices.\n", + " * A - System dynamics matrix\n", + " * C - Output matrix\n", + " * Q - Process noise covariance\n", + " * R - Measurement noise covariance\n", + " * P - Estimate error covariance\n", + " */\n", + " KalmanFilter(\n", + " double dt,\n", + " const std::vector>& A,\n", + " const std::vector>& C,\n", + " const std::vector>& Q,\n", + " const std::vector>& R,\n", + " const std::vector>& P\n", + " );\n", + "\n", + " /**\n", + " * Create a blank estimator.\n", + " */\n", + " KalmanFilter();\n", + "\n", + " /**\n", + " * Initialize the filter with initial states as zero.\n", + " */\n", + " void init();\n", + "\n", + " /**\n", + " * Initialize the filter with a guess for initial states.\n", + " */\n", + " void init(double t0, const std::vector& x0);\n", + "\n", + " /**\n", + " * Update the estimated state based on measured values. The\n", + " * time step is assumed to remain constant.\n", + " */\n", + " std::vector update(const std::vector& y);\n", + "\n", + " /**\n", + " * Update the estimated state based on measured values,\n", + " * using the given time step and dynamics matrix.\n", + " */\n", + " void update(const std::vector& y, double dt, const std::vector>& A);\n", + "\n", + " /**\n", + " * Return the current state and time.\n", + " */\n", + " std::vector state() { return x_hat; };\n", + " double time() { return t; };\n", + "\n", + "private:\n", + "\n", + " // Matrices for computation\n", + " std::vector> A, C, Q, R, P, K, P0;\n", + "\n", + " // System dimensions\n", + " int m, n;\n", + "\n", + " // Initial and current time\n", + " double t0, t;\n", + "\n", + " // Discrete time step\n", + " double dt;\n", + "\n", + " // Is the filter initialized?\n", + " bool initialized;\n", + "\n", + " // n-size identity matrix\n", + " std::vector> I;\n", + "\n", + " // State estimates\n", + " std::vector x_hat, x_hat_new;\n", + "};\n" + ] + }, + { + "cell_type": "markdown", + "id": "dc67046f-5dcb-48c4-844b-b7f248cc8cea", + "metadata": {}, + "source": [ + "### Defining CUDA Kernel functions for parallel matrix and vector operations\n", + "\n", + "The algorithm operates in two stages of the ongoing discrete Kalman filter cycle:\n", + "\n", + " - Discrete Kalman filter time update equations:\n", + " \n", + " $\\begin{aligned} & \\hat{x}_k^{-}=A \\hat{x}_{k-1} \\\\ & P_k^{-}=A P_{k-1} A^T+Q\\end{aligned}$\n", + " \n", + " They project the state and covariance estimates forward from time step *k* – 1 to step *k*.\n", + " \n", + " - Discrete Kalman filter measurement update equations: \n", + "\n", + " $\\begin{gathered}K_k=P_k^{-} H^T\\left(H P_k^{-} H^T+R\\right)^{-1} \\\\ \\hat{x}_k=\\hat{x}_k^{-}+K_k\\left(z_k-H \\hat{x}_k^{-}\\right) \\\\ P_k=\\left(I-K_k H\\right) P_k^{-}\\end{gathered}$\n", + " \n", + " The Kalman Gain is computed in this step\n", + "\n", + "This requires us to define various CUDA kernels that handle matrix-matrix, matrix-vector, and vector-vector operations parallely:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ebcc8af4-32b2-418b-bfbe-9ffb4ff9b0ff", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matAddKernel(double* a, double* b, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx = row * cols + col;\n", + " c[idx] = a[idx] + b[idx];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3f7f95d9-2f3d-4600-8896-0ed4dcca1ed6", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matSubKernel(double* a, double* b, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx = row * cols + col;\n", + " c[idx] = a[idx] - b[idx];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "00183ab5-0380-4773-a34c-053fef7782f3", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matTransposeKernel(double* a, double* c, int rows, int cols) {\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " if (col < cols && row < rows) {\n", + " int idx_in = row * cols + col;\n", + " int idx_out = col * rows + row;\n", + " c[idx_out] = a[idx_in];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "d0eff8fe-34a5-4370-b6ff-118cd4d7cb22", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matvecmulKernel(double* d_mat, double* d_vec, double* d_result, int rows, int cols) {\n", + " int tid = blockIdx.x * blockDim.x + threadIdx.x;\n", + "\n", + " if (tid < rows) {\n", + " double sum = 0.0;\n", + " for (int j = 0; j < cols; j++) {\n", + " sum += d_mat[tid * cols + j] * d_vec[j];\n", + " }\n", + " d_result[tid] = sum;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "21718df0-5f56-4122-bc1a-a368b306fdaf", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void matmulKernel(double* d_a, double* d_b, double* d_result, int rowsA, int colsA, int colsB) {\n", + " int row = blockIdx.y * blockDim.y + threadIdx.y;\n", + " int col = blockIdx.x * blockDim.x + threadIdx.x;\n", + "\n", + " if (row < rowsA && col < colsB) {\n", + " double value = 0.0;\n", + " for (int k = 0; k < colsA; k++) {\n", + " value += d_a[row * colsA + k] * d_b[k * colsB + col];\n", + " }\n", + " d_result[row * colsB + col] = value;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "18f843c9-9aee-4055-aae7-473c8f121037", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void vecsubKernel(const double* a, const double* b, double* result, int len) {\n", + " int idx = blockIdx.x * blockDim.x + threadIdx.x;\n", + " if (idx < len) {\n", + " result[idx] = a[idx] - b[idx];\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d8bb436d-238b-488f-a8a8-b8e1a4a0150d", + "metadata": {}, + "source": [ + "### Defining C++ abstraction functions to run CUDA kernels and obtain the results:\n", + "This allows us to run calls to all matrix-matrix, matrix-vector and vector-vector CUDA operations without dealing with CUDA memory and syntax in the actual algorithm code" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6ff06da4-1430-4720-affb-b9c8b1dc7a26", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> mataddCUDA(const std::vector>& a, const std::vector>& b) {\n", + " int rows = a.size();\n", + " int cols = a[0].size();\n", + " \n", + " double* h_a = new double[rows*cols];\n", + " double* h_b = new double[rows*cols];\n", + " double* h_c = new double[rows*cols];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " h_b[i*cols + j] = b[i][j];\n", + " }\n", + " }\n", + " \n", + " double* d_a, * d_b, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 dimBlock(16, 16);\n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matAddKernel<<>>(d_a, d_b, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_c);\n", + "\n", + " std::vector> result(rows, std::vector(cols));\n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " result[i][j] = h_c[i*cols + j];\n", + " }\n", + " }\n", + "\n", + " delete[] h_a;\n", + " delete[] h_b;\n", + " delete[] h_c;\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b70f7f32-b7c2-4a1c-96a0-aea42b15f2aa", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> matsubCUDA(const std::vector>& a, const std::vector>& b) {\n", + " int rows = a.size();\n", + " int cols = a[0].size();\n", + " \n", + " double* h_a = new double[rows*cols];\n", + " double* h_b = new double[rows*cols];\n", + " double* h_c = new double[rows*cols];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " h_b[i*cols + j] = b[i][j];\n", + " }\n", + " }\n", + "\n", + " double* d_a, * d_b, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_b, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, rows*cols*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 dimBlock(16, 16); \n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matSubKernel<<>>(d_a, d_b, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, rows*cols*sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_c);\n", + "\n", + " std::vector> result(rows, std::vector(cols));\n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " result[i][j] = h_c[i*cols + j];\n", + " }\n", + " }\n", + "\n", + " delete[] h_a;\n", + " delete[] h_b;\n", + " delete[] h_c;\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "86b4a452-f04c-45a4-b69b-d87bcb2e5954", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> mattransposeCUDA(const std::vector>& a) {\n", + " int rows = a.size();\n", + " int cols = a[0].size();\n", + " \n", + " double* h_a = new double[rows*cols];\n", + " double* h_c = new double[cols*rows];\n", + " \n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_a[i*cols + j] = a[i][j];\n", + " }\n", + " }\n", + "\n", + " double* d_a, * d_c;\n", + " cudaMalloc((void**)&d_a, rows*cols*sizeof(double));\n", + " cudaMalloc((void**)&d_c, cols*rows*sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a, rows*cols*sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 dimBlock(16, 16);\n", + " dim3 dimGrid((cols + dimBlock.x - 1) / dimBlock.x, (rows + dimBlock.y - 1) / dimBlock.y);\n", + " matTransposeKernel<<>>(d_a, d_c, rows, cols);\n", + "\n", + " cudaMemcpy(h_c, d_c, cols*rows*sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_c);\n", + "\n", + " std::vector> result(cols, std::vector(rows));\n", + " for (int i = 0; i < cols; i++) {\n", + " for (int j = 0; j < rows; j++) {\n", + " result[i][j] = h_c[i*rows + j];\n", + " }\n", + " }\n", + "\n", + " delete[] h_a;\n", + " delete[] h_c;\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0d925dce-377c-4d23-96d6-9c4fead297cb", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector matvecmulCUDA(const std::vector>& a, const std::vector& b) {\n", + " int rows = a.size();\n", + " int cols = a[0].size();\n", + "\n", + " if (cols != b.size()) {\n", + " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", + " }\n", + "\n", + " std::vector h_mat(rows * cols);\n", + " for (int i = 0; i < rows; i++) {\n", + " for (int j = 0; j < cols; j++) {\n", + " h_mat[i * cols + j] = a[i][j];\n", + " }\n", + " }\n", + "\n", + " double *d_mat, *d_vec, *d_result;\n", + " cudaMalloc((void**)&d_mat, rows * cols * sizeof(double));\n", + " cudaMalloc((void**)&d_vec, cols * sizeof(double));\n", + " cudaMalloc((void**)&d_result, rows * sizeof(double));\n", + "\n", + " cudaMemcpy(d_mat, h_mat.data(), rows * cols * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_vec, b.data(), cols * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " int blockSize = 256;\n", + " int gridSize = (rows + blockSize - 1) / blockSize;\n", + "\n", + " matvecmulKernel<<>>(d_mat, d_vec, d_result, rows, cols);\n", + "\n", + " std::vector result(rows);\n", + " cudaMemcpy(result.data(), d_result, rows * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_mat);\n", + " cudaFree(d_vec);\n", + " cudaFree(d_result);\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "69714460-9a3e-4966-9de7-5e4111fc40af", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> matmulCUDA(const std::vector>& a, const std::vector>& b) {\n", + " int rowsA = a.size();\n", + " int colsA = a[0].size();\n", + " int rowsB = b.size();\n", + " int colsB = b[0].size();\n", + "\n", + " if (colsA != rowsB) {\n", + " throw std::runtime_error(\"Matrix dims do not match for multiplication.\");\n", + " }\n", + "\n", + " std::vector h_a(rowsA * colsA);\n", + " std::vector h_b(rowsB * colsB);\n", + "\n", + " for (int i = 0; i < rowsA; i++) {\n", + " for (int j = 0; j < colsA; j++) {\n", + " h_a[i * colsA + j] = a[i][j];\n", + " }\n", + " }\n", + "\n", + " for (int i = 0; i < rowsB; i++) {\n", + " for (int j = 0; j < colsB; j++) {\n", + " h_b[i * colsB + j] = b[i][j];\n", + " }\n", + " }\n", + "\n", + " double *d_a, *d_b, *d_result;\n", + " \n", + " cudaMalloc((void**)&d_a, rowsA * colsA * sizeof(double));\n", + " cudaMalloc((void**)&d_b, rowsB * colsB * sizeof(double));\n", + " cudaMalloc((void**)&d_result, rowsA * colsB * sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, h_a.data(), rowsA * colsA * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, h_b.data(), rowsB * colsB * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " dim3 blockSize(16, 16);\n", + " dim3 gridSize((colsB + blockSize.x - 1) / blockSize.x, (rowsA + blockSize.y - 1) / blockSize.y);\n", + "\n", + " matmulKernel<<>>(d_a, d_b, d_result, rowsA, colsA, colsB);\n", + "\n", + " std::vector h_result(rowsA * colsB);\n", + " cudaMemcpy(h_result.data(), d_result, rowsA * colsB * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_result);\n", + " \n", + " std::vector> result(rowsA, std::vector(colsB));\n", + " for (int i = 0; i < rowsA; i++) {\n", + " for (int j = 0; j < colsB; j++) {\n", + " result[i][j] = h_result[i * colsB + j];\n", + " }\n", + " }\n", + "\n", + " return result;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7295e929-dcfc-43be-9b3b-82e9f9fbf9db", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector vecsubCUDA(const std::vector& a, const std::vector& b) {\n", + " int len = a.size();\n", + "\n", + " double* d_a;\n", + " double* d_b;\n", + " double* d_result;\n", + "\n", + " cudaMalloc((void**)&d_a, len * sizeof(double));\n", + " cudaMalloc((void**)&d_b, len * sizeof(double));\n", + " cudaMalloc((void**)&d_result, len * sizeof(double));\n", + "\n", + " cudaMemcpy(d_a, a.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", + " cudaMemcpy(d_b, b.data(), len * sizeof(double), cudaMemcpyHostToDevice);\n", + "\n", + " int blockSize = 256; \n", + " int gridSize = (len + blockSize - 1) / blockSize;\n", + "\n", + " vecsubKernel<<>>(d_a, d_b, d_result, len);\n", + "\n", + " std::vector result(len);\n", + " cudaMemcpy(result.data(), d_result, len * sizeof(double), cudaMemcpyDeviceToHost);\n", + "\n", + " cudaFree(d_a);\n", + " cudaFree(d_b);\n", + " cudaFree(d_result);\n", + "\n", + " return result;\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "87d7dcc2-bac9-4ca4-aefc-f2808d8e13ce", + "metadata": {}, + "source": [ + "### Function to obtain 2x2 and 3x3 matrix inverse using adjoint:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "cb968a24-75c4-491f-9930-0f16a48f7560", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> matinverse(const std::vector>& a) {\n", + " size_t n = a.size();\n", + " \n", + " if (n != a[0].size()) {\n", + " std::cout<<\" Shape of a : \"<> result(2, std::vector(2));\n", + " result[0][0] = a[1][1] / determinant;\n", + " result[0][1] = -a[0][1] / determinant;\n", + " result[1][0] = -a[1][0] / determinant;\n", + " result[1][1] = a[0][0] / determinant;\n", + "\n", + " return result;\n", + " }\n", + "\n", + " if (n == 3) {\n", + " double determinant = a[0][0]*(a[1][1]*a[2][2]-a[2][1]*a[1][2]) \n", + " - a[0][1]*(a[1][0]*a[2][2]-a[1][2]*a[2][0]) \n", + " + a[0][2]*(a[1][0]*a[2][1]-a[1][1]*a[2][0]);\n", + " \n", + " if (determinant == 0) {\n", + " throw std::runtime_error(\"singular matrix\");\n", + " }\n", + "\n", + " std::vector> result(3, std::vector(3));\n", + "\n", + " result[0][0] = (a[1][1] * a[2][2] - a[2][1] * a[1][2]) / determinant;\n", + " result[0][1] = (a[0][2] * a[2][1] - a[0][1] * a[2][2]) / determinant;\n", + " result[0][2] = (a[0][1] * a[1][2] - a[0][2] * a[1][1]) / determinant;\n", + " result[1][0] = (a[1][2] * a[2][0] - a[1][0] * a[2][2]) / determinant;\n", + " result[1][1] = (a[0][0] * a[2][2] - a[0][2] * a[2][0]) / determinant;\n", + " result[1][2] = (a[1][0] * a[0][2] - a[0][0] * a[1][2]) / determinant;\n", + " result[2][0] = (a[1][0] * a[2][1] - a[2][0] * a[1][1]) / determinant;\n", + " result[2][1] = (a[2][0] * a[0][1] - a[0][0] * a[2][1]) / determinant;\n", + " result[2][2] = (a[0][0] * a[1][1] - a[1][0] * a[0][1]) / determinant;\n", + "\n", + " return result;\n", + " }\n", + "\n", + " throw std::runtime_error(\"Only 2x2 and 3x3 matrices supported for inversion\");\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "c82451a0-3439-4b31-930d-9637db19f729", + "metadata": {}, + "source": [ + "### Overloaded functions to print both matrices and vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "84f29c1e-d4c2-4680-a579-14d63a2bde08", + "metadata": {}, + "outputs": [], + "source": [ + "void printMatrix(const std::vector>& matrix) {\n", + " for (size_t i = 0; i < matrix.size(); i++) {\n", + " for (size_t j = 0; j < matrix[i].size(); j++) {\n", + " std::cout << matrix[i][j] << \" \";\n", + " }\n", + " std::cout << std::endl;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fa143e89-d1c5-4b54-b22d-9921d7e1000d", + "metadata": {}, + "outputs": [], + "source": [ + "void printMatrix(const std::vector& vec) {\n", + " for (size_t i = 0; i < vec.size(); i++) {\n", + " std::cout << vec[i] << \" \";\n", + " }\n", + " std::cout << std::endl;\n", + " }\n" + ] + }, + { + "cell_type": "markdown", + "id": "79ebaf27-693f-4213-800d-8e8b5c9abaa3", + "metadata": {}, + "source": [ + "### Test for matrix-matrix multiplication using CUDA" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "6d718c47-3bc6-4c7a-893c-4fedc399926d", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> matrixA = {\n", + " {1.0, 2.0},\n", + " {3.0, 4.0},\n", + " {5.0, 6.0}\n", + " };\n", + "\n", + "std::vector> matrixB = {\n", + " {1.0, 2.0, 3.0, 4.0},\n", + " {5.0, 6.0, 7.0, 8.0}\n", + " };\n", + "\n", + "std::vector> result = matmulCUDA(matrixA, matrixB);" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "3e1c925e-c164-46af-8e0f-65c05f0ee1c0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Result matrix:\n", + "11 14 17 20 \n", + "23 30 37 44 \n", + "35 46 57 68 \n" + ] + } + ], + "source": [ + "std::cout << \"Result matrix:\" << std::endl;\n", + "printMatrix(result);" + ] + }, + { + "cell_type": "markdown", + "id": "9312a581-d000-4da6-b257-b22a7da942f6", + "metadata": {}, + "source": [ + "## Constructor to initialise an object of the Kalman Filter class" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "f4c9238f-674a-4cf8-8128-840cc7f61e0f", + "metadata": {}, + "outputs": [], + "source": [ + "KalmanFilter::KalmanFilter(\n", + " double dt,\n", + " const std::vector>& A,\n", + " const std::vector>& C,\n", + " const std::vector>& Q,\n", + " const std::vector>& R,\n", + " const std::vector>& P)\n", + " : A(A), C(C), Q(Q), R(R), P0(P),\n", + " m(C.size()), n(A.size()), dt(dt), initialized(false),\n", + " I(n, std::vector(n)), x_hat(n), x_hat_new(n)\n", + "{\n", + " for (int i = 0; i < n; i++) {\n", + " I[i][i] = 1.0;\n", + " }\n", + "}\n", + "\n", + "KalmanFilter::KalmanFilter() {}" + ] + }, + { + "cell_type": "markdown", + "id": "cb0ecf07-b676-4a2f-bb91-4d0ebfe4fc17", + "metadata": {}, + "source": [ + "### Describe the `KalmanFilter::init` function to initialize the filter with initial states as zero." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ed99dd28-5959-409a-8226-7e292bc4b0de", + "metadata": {}, + "outputs": [], + "source": [ + "void KalmanFilter::init() {\n", + " std::fill(x_hat.begin(), x_hat.end(), 0.0);\n", + " P = P0;\n", + " t0 = 0;\n", + " t = t0;\n", + " initialized = true;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "f4291823-8a85-40bf-baf7-c0686388e435", + "metadata": {}, + "source": [ + "### Describe the `KalmanFilter::init` function to initialize the filter with a guess for initial states." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "bd953ec6-ceed-489c-8e33-8eb7ae6bb32f", + "metadata": {}, + "outputs": [], + "source": [ + "void KalmanFilter::init(double t0, const std::vector& x0) {\n", + " x_hat = x0;\n", + " P = P0;\n", + " this->t0 = t0;\n", + " t = t0;\n", + " initialized = true;\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "3b71922a-0448-4fa0-8196-08142c2f64b8", + "metadata": {}, + "source": [ + "### Defining a function that executes the two stages of the ongoing discrete Kalman filter cycle" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ea057f20-9097-4dd3-946e-ab7a383afe69", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector KalmanFilter::update(const std::vector& y) {\n", + " if (!initialized)\n", + " throw std::runtime_error(\"Filter is not initialized!\");\n", + " \n", + " // Discrete Kalman filter time update \n", + " x_hat_new = matvecmulCUDA(A, x_hat);\n", + " P = mataddCUDA(matmulCUDA(matmulCUDA(A, P), mattransposeCUDA(A)), Q);\n", + " \n", + " // Discrete Kalman filter measurement update\n", + " std::vector> inv = matinverse(mataddCUDA(matmulCUDA(matmulCUDA(C, P), mattransposeCUDA(C)), R));\n", + " K = matmulCUDA(matmulCUDA(P, mattransposeCUDA(C)), inv);\n", + " std::vector temp = matvecmulCUDA(C, x_hat_new);\n", + " std::vector difference = vecsubCUDA(y, temp);\n", + " std::vector gain = K[0];\n", + " for (size_t i = 0; i < x_hat_new.size(); i++) {\n", + " x_hat_new[i] += matvecmulCUDA(K, difference)[i];\n", + " }\n", + "\n", + " P = matmulCUDA(matsubCUDA(I, matmulCUDA(K, C)), P);\n", + "\n", + " x_hat = x_hat_new;\n", + " t += dt;\n", + " \n", + " return gain;\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b1d8963-599a-4034-80de-e7c92c59c321", + "metadata": {}, + "source": [ + "### Update the estimated state based on measured values, using the given time step and dynamics matrix:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "08d28163-a086-46a9-bf41-044f48530929", + "metadata": {}, + "outputs": [], + "source": [ + "void KalmanFilter::update(const std::vector& y, double dt, const std::vector>& A) {\n", + " this->A = A;\n", + " this->dt = dt;\n", + " update(y);\n", + "}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7c31cf57-c669-43af-8aba-0b03c08b575d", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector measurements;" + ] + }, + { + "cell_type": "markdown", + "id": "c93b8e0e-1b66-4d26-98fb-ec7a4e75459b", + "metadata": {}, + "source": [ + "### The cross language support allows us to read the measurements from the python side and initialise the std::vector to be passed to the Kalman Filter\n", + "\n", + "#### setData here recieves the python list as a `cppyy.gbl.std.vector` and passes it to `std::vector measurements` " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "caa856ab-e4e1-4d88-984d-058fb21da79f", + "metadata": {}, + "outputs": [], + "source": [ + "void setData(const std::vector& input) {\n", + " measurements = input;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "ee23b991-3495-4ef5-94c0-b906867d4544", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: matplotlib in /opt/conda/envs/.venv/lib/python3.10/site-packages (3.8.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.1.1)\n", + "Requirement already satisfied: cycler>=0.10 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (0.11.0)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (4.42.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.4.5)\n", + "Requirement already satisfied: numpy<2,>=1.21 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (1.26.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (23.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (10.0.1)\n", + "Requirement already satisfied: pyparsing>=2.3.1 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (3.1.1)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from matplotlib) (2.8.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/envs/.venv/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)\n" + ] + } + ], + "source": [ + "!pip install matplotlib\n", + "!pip install PyYaml" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "c35e13e9-4f7f-4cb7-abd7-03362ddd6848", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import yaml\n", + "import cppyy\n", + "\n", + "with open('data/measurements.yml', 'r') as file:\n", + " data_dict = yaml.safe_load(file)\n", + " data_list = list(float(x) for x in data_dict['data'])\n", + "\n", + "measurements_vector = cppyy.gbl.std.vector['double'](data_list)" + ] + }, + { + "cell_type": "markdown", + "id": "1401cbcd-42e1-415a-a866-43a3325b7986", + "metadata": {}, + "source": [ + "### Lets verify the data integrity by plotting a position vs time graph" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "4b203531-45e5-409b-82ec-86582309cfd9", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "dt = 1/30\n", + " \n", + "# Generating time points for the x-axis based on dt\n", + "time_points = [i*dt for i in range(len(data_list))]\n", + "\n", + "# Plotting\n", + "plt.plot(time_points, data_list, '-o', label='Position')\n", + "plt.xlabel('Time (s)')\n", + "plt.ylabel('Position (x)')\n", + "plt.title('Position vs. Time')\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.savefig(\"position_graph.png\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "88774a21-8d16-4abc-ae71-3f81328acf08", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "f9188118-ab05-450d-a6e9-9e0fac78bb50", + "metadata": {}, + "source": [ + "### Now we pass it to the C++ side by calling `setData`" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "bc5c3165-5cc3-4017-9e5f-13f61388f727", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "cppyy.gbl.setData(measurements_vector)" + ] + }, + { + "cell_type": "markdown", + "id": "018496cc-7f4d-499e-a982-00610d7f9651", + "metadata": {}, + "source": [ + "### Putting it all together:\n", + "We define the `run_kf` function that initialises the following:\n", + "- Number of states\n", + "- Dimensions of each measurement\n", + "- Time steps used $\\begin{aligned} & {dt}\\end{aligned}$\n", + "- Covariance matrices\n", + "- Passes the measurement data to $\\begin{aligned} & \\hat{x}_0^{-}\\end{aligned}$\n", + "\n", + "Then the measurements are fed into filter, and output estimated states\n", + "\n", + "The algorithm then runs by looping over the measurements:\n", + "\n", + "$\\begin{gathered} & for\\:y\\:in\\:measurements\\::\\end{gathered}$\n", + "$\\begin{gathered} & kf.update(y)\\end{gathered}$" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "8938a288-fc2c-4104-82c6-added4466c8c", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector> run_kf(bool verbose) {\n", + " \n", + " int n = 3; \n", + " int m = 1; \n", + "\n", + " double dt = 1.0 / 30; // Time step\n", + " \n", + " std::vector g_preds;\n", + " \n", + " std::vector> A(n, std::vector(n));\n", + " std::vector> C(m, std::vector(n));\n", + " std::vector> Q(n, std::vector(n));\n", + " std::vector> R(m, std::vector(m));\n", + " std::vector> P(n, std::vector(n));\n", + " \n", + " A = {{1, dt, 0}, {0, 1, dt}, {0, 0, 1}};\n", + " C = {{1, 0, 0}};\n", + " Q = {{.05, .05, .0}, {.05, .05, .0}, {.0, .0, .0}};\n", + " R = {{5}};\n", + " P = {{.1, .1, .1}, {.1, 10000, 10}, {.1, 10, 100}};\n", + " \n", + " KalmanFilter kf(dt, A, C, Q, R, P);\n", + " \n", + " std::vector x0 = {measurements[0], 0, -15};\n", + " std::vector> gain;\n", + " kf.init(0, x0);\n", + "\n", + " \n", + " std::vector y(m);\n", + " if(verbose) {\n", + " std::cout << \"t = \" << 0 << \", \" << \"x_hat[0]: \";\n", + " for (auto& val : kf.state()){\n", + " std::cout << val << \" \";\n", + " }\n", + " std::cout << std::endl;\n", + " }\n", + " \n", + " int i;\n", + " for (i = 0; i < measurements.size(); i++) {\n", + " y[0] = measurements[i];\n", + " gain.push_back(kf.update(y));\n", + " if(verbose) {\n", + " std::cout << \"t = \" << (i + 1) * dt << \", y[\" << i << \"] = \" << y[0] << \", x_hat[\" << i << \"] = \";\n", + " for (auto& val : kf.state()) {\n", + " std::cout << val << \" \";\n", + " }\n", + " g_preds.push_back(kf.state()[2]);\n", + " std::cout << std::endl;\n", + " }\n", + " }\n", + " std::cout << std::endl;\n", + " std::cout<<\"Exec Success, Final kf states:\";\n", + " for (auto& val : kf.state()) std::cout << val << \" \";\n", + " std::cout << std::endl;\n", + "\n", + " std::vector> g_res;\n", + " for (size_t i = 0; i < g_preds.size(); ++i) {\n", + " std::vector pair = {g_preds[i], gain[i][0]};\n", + " g_res.push_back(pair);\n", + " }\n", + " \n", + " return g_res;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "7a910405-c4ff-41fa-aec5-69aa24598f9c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "t = 0, x_hat[0]: 1.04203 0 -15 \n", + "t = 0.0333333, y[0] = 1.04203, x_hat[0] = 1.04203 -0.5 -15 \n", + "t = 0.0666667, y[1] = 1.10727, x_hat[1] = 1.08556 -0.0966619 -14.9988 \n", + "t = 0.1, y[2] = 1.29135, x_hat[2] = 1.21317 0.720024 -14.9952 \n", + "t = 0.133333, y[3] = 1.48485, x_hat[3] = 1.36865 1.21707 -14.9881 \n", + "t = 0.166667, y[4] = 1.72826, x_hat[4] = 1.55548 1.60875 -14.9732 \n", + "t = 0.2, y[5] = 1.74216, x_hat[5] = 1.66278 1.38374 -14.9637 \n", + "t = 0.233333, y[6] = 2.11672, x_hat[6] = 1.85606 1.53382 -14.9229 \n", + "t = 0.266667, y[7] = 2.14529, x_hat[7] = 1.98512 1.34018 -14.8908 \n", + "t = 0.3, y[8] = 2.1603, x_hat[8] = 2.06901 0.981678 -14.8682 \n", + "t = 0.333333, y[9] = 2.21269, x_hat[9] = 2.13267 0.58593 -14.8441 \n", + "t = 0.366667, y[10] = 2.57709, x_hat[10] = 2.26319 0.425535 -14.7321 \n", + "t = 0.4, y[11] = 2.66822, x_hat[11] = 2.37389 0.210228 -14.6096 \n", + "t = 0.433333, y[12] = 2.51642, x_hat[12] = 2.41279 -0.189016 -14.56 \n", + "t = 0.466667, y[13] = 2.76034, x_hat[13] = 2.4865 -0.459522 -14.4115 \n", + "t = 0.5, y[14] = 2.88132, x_hat[14] = 2.56093 -0.701772 -14.2172 \n", + "t = 0.533333, y[15] = 2.88374, x_hat[15] = 2.61135 -0.980106 -14.0346 \n", + "t = 0.566667, y[16] = 2.94485, x_hat[16] = 2.65526 -1.24377 -13.8225 \n", + "t = 0.6, y[17] = 2.82867, x_hat[17] = 2.65812 -1.58494 -13.6876 \n", + "t = 0.633333, y[18] = 3.00066, x_hat[18] = 2.68613 -1.81973 -13.4217 \n", + "t = 0.666667, y[19] = 3.12921, x_hat[19] = 2.72799 -1.9816 -13.0631 \n", + "t = 0.7, y[20] = 2.85836, x_hat[20] = 2.70186 -2.30405 -12.9167 \n", + "t = 0.733333, y[21] = 2.83808, x_hat[21] = 2.66841 -2.61016 -12.7523 \n", + "t = 0.766667, y[22] = 2.68975, x_hat[22] = 2.60351 -2.971 -12.6666 \n", + "t = 0.8, y[23] = 2.66533, x_hat[23] = 2.53742 -3.29663 -12.5373 \n", + "t = 0.833333, y[24] = 2.81613, x_hat[24] = 2.50742 -3.47892 -12.2229 \n", + "t = 0.866667, y[25] = 2.81004, x_hat[25] = 2.47782 -3.63091 -11.8847 \n", + "t = 0.9, y[26] = 2.88322, x_hat[26] = 2.46572 -3.70491 -11.4628 \n", + "t = 0.933333, y[27] = 2.69789, x_hat[27] = 2.41597 -3.86954 -11.1819 \n", + "t = 0.966667, y[28] = 2.43422, x_hat[28] = 2.31755 -4.15264 -11.068 \n", + "t = 1, y[29] = 2.23465, x_hat[29] = 2.19065 -4.48803 -11.0261 \n", + "t = 1.03333, y[30] = 2.30279, x_hat[30] = 2.09529 -4.69914 -10.8344 \n", + "t = 1.06667, y[31] = 2.0207, x_hat[31] = 1.95562 -5.01191 -10.7763 \n", + "t = 1.1, y[32] = 1.94394, x_hat[32] = 1.82057 -5.28093 -10.6701 \n", + "t = 1.13333, y[33] = 1.82498, x_hat[33] = 1.68155 -5.53372 -10.5515 \n", + "t = 1.16667, y[34] = 1.52526, x_hat[34] = 1.50284 -5.86969 -10.5338 \n", + "t = 1.2, y[35] = 1.86968, x_hat[35] = 1.42124 -5.91291 -10.1937 \n", + "t = 1.23333, y[36] = 1.18073, x_hat[36] = 1.2154 -6.27594 -10.2188 \n", + "t = 1.26667, y[37] = 1.1073, x_hat[37] = 1.02642 -6.56369 -10.1629 \n", + "t = 1.3, y[38] = 0.916168, x_hat[38] = 0.829169 -6.84703 -10.1055 \n", + "t = 1.33333, y[39] = 0.678548, x_hat[39] = 0.616214 -7.14521 -10.0663 \n", + "t = 1.36667, y[40] = 0.562382, x_hat[40] = 0.41403 -7.39117 -9.97735 \n", + "t = 1.4, y[41] = 0.355468, x_hat[41] = 0.204015 -7.63475 -9.89085 \n", + "t = 1.43333, y[42] = -0.155607, x_hat[42] = -0.0706529 -8.01301 -9.93708 \n", + "t = 1.46667, y[43] = -0.287199, x_hat[43] = -0.328135 -8.32148 -9.91585 \n", + "t = 1.5, y[44] = -0.602973, x_hat[44] = -0.605038 -8.65089 -9.91483 \n", + "t = 1.53333, y[45] = -0.292649, x_hat[45] = -0.781097 -8.72417 -9.68453 \n", + "t = 1.56667, y[46] = -0.602973, x_hat[46] = -0.985005 -8.85119 -9.51267 \n", + "t = 1.6, y[47] = -0.924198, x_hat[47] = -1.21467 -9.02335 -9.38793 \n", + "t = 1.63333, y[48] = -1.25632, x_hat[48] = -1.46826 -9.2333 -9.30101 \n", + "t = 1.66667, y[49] = -1.59935, x_hat[49] = -1.74413 -9.4748 -9.24427 \n", + "t = 1.7, y[50] = -1.95327, x_hat[50] = -2.04085 -9.74255 -9.21144 \n", + "t = 1.73333, y[51] = -2.3181, x_hat[51] = -2.35716 -10.032 -9.19743 \n", + "t = 1.76667, y[52] = -2.69382, x_hat[52] = -2.69196 -10.3394 -9.19807 \n", + "t = 1.8, y[53] = -3.08044, x_hat[53] = -3.04427 -10.6615 -9.20997 \n", + "t = 1.83333, y[54] = -3.47797, x_hat[54] = -3.41323 -10.9956 -9.23039 \n", + "t = 1.86667, y[55] = -3.88639, x_hat[55] = -3.7981 -11.3393 -9.25711 \n", + "t = 1.9, y[56] = -4.30572, x_hat[56] = -4.19822 -11.6907 -9.28834 \n", + "t = 1.93333, y[57] = -4.73594, x_hat[57] = -4.61301 -12.0482 -9.32265 \n", + "t = 1.96667, y[58] = -5.17707, x_hat[58] = -5.04196 -12.4105 -9.3589 \n", + "t = 2, y[59] = -5.62909, x_hat[59] = -5.48464 -12.7763 -9.39618 \n", + "t = 2.03333, y[60] = -6.09202, x_hat[60] = -5.94065 -13.1448 -9.43379 \n", + "t = 2.06667, y[61] = -6.56584, x_hat[61] = -6.40966 -13.5151 -9.47117 \n", + "t = 2.1, y[62] = -7.05057, x_hat[62] = -6.89136 -13.8865 -9.5079 \n", + "t = 2.13333, y[63] = -7.54619, x_hat[63] = -7.38549 -14.2587 -9.54365 \n", + "t = 2.16667, y[64] = -8.05271, x_hat[64] = -7.89184 -14.6309 -9.57819 \n", + "t = 2.2, y[65] = -8.57014, x_hat[65] = -8.41019 -15.003 -9.61135 \n", + "t = 2.23333, y[66] = -9.09846, x_hat[66] = -8.94038 -15.3747 -9.64301 \n", + "t = 2.26667, y[67] = -9.63769, x_hat[67] = -9.48225 -15.7456 -9.67311 \n", + "t = 2.3, y[68] = -10.1878, x_hat[68] = -10.0357 -16.1156 -9.70161 \n", + "t = 2.33333, y[69] = -10.7488, x_hat[69] = -10.6005 -16.4845 -9.7285 \n", + "t = 2.36667, y[70] = -11.3208, x_hat[70] = -11.1767 -16.8523 -9.75379 \n", + "t = 2.4, y[71] = -11.9036, x_hat[71] = -11.7642 -17.2188 -9.77751 \n", + "t = 2.43333, y[72] = -12.4973, x_hat[72] = -12.3628 -17.5841 -9.7997 \n", + "t = 2.46667, y[73] = -13.1019, x_hat[73] = -12.9725 -17.9479 -9.82041 \n", + "t = 2.5, y[74] = -13.7175, x_hat[74] = -13.5932 -18.3104 -9.83969 \n", + "t = 2.53333, y[75] = -14.3439, x_hat[75] = -14.225 -18.6716 -9.8576 \n", + "t = 2.56667, y[76] = -14.9812, x_hat[76] = -14.8677 -19.0313 -9.87422 \n", + "t = 2.6, y[77] = -15.6294, x_hat[77] = -15.5214 -19.3897 -9.88959 \n", + "t = 2.63333, y[78] = -16.2886, x_hat[78] = -16.1859 -19.7467 -9.90379 \n", + "t = 2.66667, y[79] = -16.9586, x_hat[79] = -16.8613 -20.1025 -9.91688 \n", + "t = 2.7, y[80] = -17.6395, x_hat[80] = -17.5475 -20.4569 -9.92892 \n", + "t = 2.73333, y[81] = -18.3313, x_hat[81] = -18.2446 -20.8101 -9.93998 \n", + "t = 2.76667, y[82] = -19.0341, x_hat[82] = -18.9524 -21.162 -9.9501 \n", + "t = 2.8, y[83] = -19.7477, x_hat[83] = -19.6711 -21.5128 -9.95936 \n", + "t = 2.83333, y[84] = -20.4722, x_hat[84] = -20.4006 -21.8624 -9.96781 \n", + "t = 2.86667, y[85] = -21.2076, x_hat[85] = -21.1408 -22.2109 -9.97549 \n", + "t = 2.9, y[86] = -21.954, x_hat[86] = -21.8918 -22.5584 -9.98246 \n", + "t = 2.93333, y[87] = -22.7112, x_hat[87] = -22.6535 -22.9048 -9.98876 \n", + "t = 2.96667, y[88] = -23.4793, x_hat[88] = -23.4261 -23.2503 -9.99445 \n", + "t = 3, y[89] = -24.2583, x_hat[89] = -24.2094 -23.5948 -9.99955 \n", + "t = 3.03333, y[90] = -25.0483, x_hat[90] = -25.0034 -23.9384 -10.0041 \n", + "t = 3.06667, y[91] = -25.8491, x_hat[91] = -25.8082 -24.2812 -10.0082 \n", + "t = 3.1, y[92] = -26.6608, x_hat[92] = -26.6238 -24.6231 -10.0118 \n", + "t = 3.13333, y[93] = -27.4834, x_hat[93] = -27.4501 -24.9642 -10.015 \n", + "t = 3.16667, y[94] = -28.3169, x_hat[94] = -28.2872 -25.3046 -10.0177 \n", + "t = 3.2, y[95] = -29.1614, x_hat[95] = -29.135 -25.6443 -10.0201 \n", + "t = 3.23333, y[96] = -30.0167, x_hat[96] = -29.9937 -25.9833 -10.0222 \n", + "t = 3.26667, y[97] = -30.8829, x_hat[97] = -30.863 -26.3216 -10.0239 \n", + "t = 3.3, y[98] = -31.76, x_hat[98] = -31.7432 -26.6593 -10.0254 \n", + "t = 3.33333, y[99] = -32.6481, x_hat[99] = -32.6341 -26.9964 -10.0265 \n", + "t = 3.36667, y[100] = -33.547, x_hat[100] = -33.5358 -27.333 -10.0274 \n", + "t = 3.4, y[101] = -34.4568, x_hat[101] = -34.4483 -27.669 -10.0281 \n", + "t = 3.43333, y[102] = -35.3775, x_hat[102] = -35.3716 -28.0045 -10.0286 \n", + "t = 3.46667, y[103] = -36.3092, x_hat[103] = -36.3056 -28.3395 -10.0289 \n", + "t = 3.5, y[104] = -37.2517, x_hat[104] = -37.2505 -28.674 -10.029 \n", + "t = 3.53333, y[105] = -38.2051, x_hat[105] = -38.2061 -29.0081 -10.0289 \n", + "t = 3.56667, y[106] = -39.1694, x_hat[106] = -39.1726 -29.3418 -10.0287 \n", + "t = 3.6, y[107] = -40.1447, x_hat[107] = -40.1498 -29.6751 -10.0283 \n", + "t = 3.63333, y[108] = -41.1308, x_hat[108] = -41.1378 -30.008 -10.0278 \n", + "t = 3.66667, y[109] = -42.1278, x_hat[109] = -42.1367 -30.3405 -10.0272 \n", + "t = 3.7, y[110] = -43.1357, x_hat[110] = -43.1463 -30.6727 -10.0265 \n", + "t = 3.73333, y[111] = -44.1546, x_hat[111] = -44.1668 -31.0046 -10.0257 \n", + "t = 3.76667, y[112] = -45.1843, x_hat[112] = -45.1981 -31.3362 -10.0247 \n", + "t = 3.8, y[113] = -46.2249, x_hat[113] = -46.2402 -31.6674 -10.0238 \n", + "t = 3.83333, y[114] = -47.2764, x_hat[114] = -47.2932 -31.9984 -10.0227 \n", + "t = 3.86667, y[115] = -48.3389, x_hat[115] = -48.3569 -32.3292 -10.0216 \n", + "t = 3.9, y[116] = -49.4122, x_hat[116] = -49.4315 -32.6597 -10.0204 \n", + "t = 3.93333, y[117] = -50.4964, x_hat[117] = -50.517 -32.9899 -10.0191 \n", + "t = 3.96667, y[118] = -51.5915, x_hat[118] = -51.6132 -33.3199 -10.0179 \n", + "t = 4, y[119] = -52.6976, x_hat[119] = -52.7204 -33.6497 -10.0165 \n", + "t = 4.03333, y[120] = -53.8145, x_hat[120] = -53.8383 -33.9793 -10.0152 \n", + "t = 4.06667, y[121] = -54.9423, x_hat[121] = -54.9671 -34.3087 -10.0138 \n", + "t = 4.1, y[122] = -56.081, x_hat[122] = -56.1067 -34.638 -10.0123 \n", + "t = 4.13333, y[123] = -57.2307, x_hat[123] = -57.2572 -34.967 -10.0109 \n", + "t = 4.16667, y[124] = -58.3912, x_hat[124] = -58.4185 -35.2959 -10.0094 \n", + "t = 4.2, y[125] = -59.5626, x_hat[125] = -59.5907 -35.6246 -10.0079 \n", + "t = 4.23333, y[126] = -60.7449, x_hat[126] = -60.7738 -35.9532 -10.0064 \n", + "t = 4.26667, y[127] = -61.9382, x_hat[127] = -61.9677 -36.2817 -10.0048 \n", + "t = 4.3, y[128] = -63.1423, x_hat[128] = -63.1724 -36.61 -10.0033 \n", + "t = 4.33333, y[129] = -64.3573, x_hat[129] = -64.388 -36.9381 -10.0018 \n", + "t = 4.36667, y[130] = -65.5832, x_hat[130] = -65.6145 -37.2662 -10.0002 \n", + "t = 4.4, y[131] = -66.8201, x_hat[131] = -66.8518 -37.5942 -9.99866 \n", + "t = 4.43333, y[132] = -68.0678, x_hat[132] = -68.1 -37.922 -9.9971 \n", + "t = 4.46667, y[133] = -69.3264, x_hat[133] = -69.3591 -38.2498 -9.99554 \n", + "t = 4.5, y[134] = -70.5959, x_hat[134] = -70.629 -38.5774 -9.99399 \n", + "t = 4.53333, y[135] = -71.8764, x_hat[135] = -71.9099 -38.905 -9.99244 \n", + "t = 4.56667, y[136] = -73.1677, x_hat[136] = -73.2015 -39.2324 -9.99089 \n", + "t = 4.6, y[137] = -74.4699, x_hat[137] = -74.5041 -39.5598 -9.98934 \n", + "t = 4.63333, y[138] = -75.783, x_hat[138] = -75.8175 -39.8871 -9.98781 \n", + "t = 4.66667, y[139] = -77.1071, x_hat[139] = -77.1418 -40.2144 -9.98628 \n", + "t = 4.7, y[140] = -78.442, x_hat[140] = -78.477 -40.5416 -9.98476 \n", + "t = 4.73333, y[141] = -79.7878, x_hat[141] = -79.823 -40.8687 -9.98324 \n", + "t = 4.76667, y[142] = -81.1445, x_hat[142] = -81.18 -41.1957 -9.98174 \n", + "t = 4.8, y[143] = -82.5122, x_hat[143] = -82.5478 -41.5227 -9.98025 \n", + "t = 4.83333, y[144] = -83.8907, x_hat[144] = -83.9265 -41.8497 -9.97877 \n", + "\n", + "Exec Success, Final kf states:-83.9265 -41.8497 -9.97877 \n" + ] + } + ], + "source": [ + "std::vector> g_res = run_kf(true);" + ] + }, + { + "cell_type": "markdown", + "id": "835df3bf-9017-4a24-ac29-8e9d821bb881", + "metadata": {}, + "source": [ + "### Defining a C++ function to resolve the 2D vector result *{Kalman Filter Estimates, Kalman Gain}*\n", + "\n", + "- The function returns the KF estimates if the provided axis is 0 and Kalman Gains if axis = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "2a47ecf6-f4e9-44b6-b72c-a935ed02fdc1", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector ret_1d_vector(std::vector> res, int axis) {\n", + " std::vector ret;\n", + " for (int i = 0; i < res.size(); i++) {\n", + " ret.push_back(res[i][axis]);\n", + " }\n", + " return ret;\n", + "}\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "1566d4bf-3952-4616-8b45-b0231c5be9f5", + "metadata": {}, + "source": [ + "#### We obtain the final results as two std::vectors that can be accessed in Python:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "4e3c7506-8fb8-42f0-b5af-e8950d088a3d", + "metadata": {}, + "outputs": [], + "source": [ + "std::vector py_g_pred = ret_1d_vector(g_res, 0);\n", + "std::vector py_kf_gains = ret_1d_vector(g_res, 1);" + ] + }, + { + "cell_type": "markdown", + "id": "2f7b1159-8a34-41c2-99d0-56a7e5714cab", + "metadata": {}, + "source": [ + "### Lets plot the trend of Kalman Gain across the time steps to verify its working\n", + "\n", + "If the algorithm converged properly, we should see an exponential decrease in the gain with time" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "129a199c-70b0-454b-a70d-68373c4786f2", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "import matplotlib.pyplot as plt\n", + "\n", + "kalman_gains = list(cppyy.gbl.py_kf_gains)\n", + "x = range(len(kalman_gains))" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "f7bcd055-7470-42c9-aad8-238ff3d3566e", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "plt.figure()\n", + "plt.plot(x, kalman_gains, color='blue', marker='v')\n", + " \n", + "plt.xlabel('Time Steps')\n", + "plt.ylabel('Kalman Gain')\n", + "plt.title('Kalman Gain Plot')\n", + "plt.savefig(\"kalman_gain_plot.png\")\n", + " \n", + "plt.yscale('symlog')\n", + " \n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "842b0e92-4722-4551-81a6-e3a00229da0f", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "8ae33816-740c-455b-82d2-4cacf29206c1", + "metadata": {}, + "source": [ + "### Now we can plot the values of Gravitational Acceleration *(g)* obtained as Kalman Estimates :" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "8801e6da-b52b-46b1-b895-e138bbaa39c0", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "true_val = 9.81\n", + "g_pred = list(cppyy.gbl.py_g_pred)\n", + "g_pred = list(-x for x in g_pred)\n", + " \n", + "x = range(len(g_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "09889e69-195e-44c0-87ee-a82616492101", + "metadata": {}, + "outputs": [], + "source": [ + "%%python\n", + "\n", + "plt.figure()\n", + "plt.axhline(y=true_val, color='green', linestyle='-')\n", + "plt.plot(x, g_pred, color='orange', marker='o', label='KF Estimates')\n", + "plt.annotate(f'{true_val}', xy=(-0.5, true_val), color='green',\n", + " verticalalignment='center', horizontalalignment = 'left')\n", + " \n", + "plt.xlabel('Index')\n", + "plt.ylabel('Acceleration (m/s₂)')\n", + "plt.title('True Value vs. g_pred')\n", + "plt.legend()\n", + "plt.savefig(\"1D_KF_plot.png\")\n", + " \n", + "plt.yscale('symlog')\n", + " \n", + "plt.show()\n" + ] + }, + { + "cell_type": "markdown", + "id": "96887ee7-3a13-4440-8f27-8e93b5bc96db", + "metadata": {}, + "source": [ + "### Result : The Kalman Filter begins to converge on the value of 9.81" + ] + }, + { + "cell_type": "markdown", + "id": "6596ee2f-a810-4e5c-a4c7-5634d71ec608", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CUDA (C++17)", + "language": "CUDA", + "name": "cuda-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/omp/hello_world.ipynb b/notebooks/omp/hello_world.ipynb new file mode 100644 index 00000000..d0764753 --- /dev/null +++ b/notebooks/omp/hello_world.ipynb @@ -0,0 +1,101 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "73cbab37-71dd-477d-981b-f2ec28c01bd6", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ef1cd58a-672c-4a6f-843a-6c88fc4911f3", + "metadata": {}, + "outputs": [], + "source": [ + "Cpp::LoadLibrary(\"libomp\");\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c2b754ad-9553-4a42-b990-f990a9a269ed", + "metadata": {}, + "outputs": [], + "source": [ + "int main() {\n", + " int max_threads = omp_get_max_threads();\n", + "\n", + " printf(\"max threads: %d\\n\", max_threads);\n", + " omp_set_num_threads(max_threads);\n", + "\n", + "#pragma omp parallel\n", + " {\n", + " int id = omp_get_thread_num();\n", + " printf(\"Hello World from thread = %d with %d threads\\n\", id, omp_get_num_threads());\n", + " }\n", + "\n", + " printf(\"all done, with hopefully %d threads\\n\", max_threads);\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a37a13d4-fc82-496e-8f42-9e718a8c2aa0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "max threads: 8\n", + "Hello World from thread = 2 with 8 threads\n", + "Hello World from thread = 0 with 8 threads\n", + "Hello World from thread = 1 with 8 threads\n", + "Hello World from thread = 7 with 8 threads\n", + "Hello World from thread = 4 with 8 threads\n", + "Hello World from thread = 5 with 8 threads\n", + "Hello World from thread = 3 with 8 threads\n", + "Hello World from thread = 6 with 8 threads\n", + "all done, with hopefully 8 threads\n" + ] + } + ], + "source": [ + "main();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "af54945c-2412-4aee-bbfc-a2f6fc72d23f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++17 (+OpenMP)", + "language": "C++17", + "name": "omp-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/omp/linked_list.ipynb b/notebooks/omp/linked_list.ipynb new file mode 100644 index 00000000..106cecd6 --- /dev/null +++ b/notebooks/omp/linked_list.ipynb @@ -0,0 +1,224 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "156447d2-9279-45a0-890b-4e519d2c796b", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c96fdeb0-817d-48c0-af8e-20a52947d60b", + "metadata": {}, + "outputs": [], + "source": [ + "#ifndef N\n", + "#define N 5\n", + "#endif\n", + "#ifndef FS\n", + "#define FS 38\n", + "#endif" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8da842e1-db02-49e0-929d-4e67cbc08172", + "metadata": {}, + "outputs": [], + "source": [ + "Cpp::LoadLibrary(\"libomp\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22f97c49-78d1-496e-ac7c-978aed95331a", + "metadata": {}, + "outputs": [], + "source": [ + "struct node {\n", + " int data;\n", + " int fibdata;\n", + " struct node *next;\n", + "};" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b16b1e8a-8831-4b8d-9d57-09deeaaa88ee", + "metadata": {}, + "outputs": [], + "source": [ + "struct node *init_list(struct node *p);\n", + "void processwork(struct node *p);\n", + "int fib(int n);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ef8af6c-1d6f-4c68-84bc-3dd1d8092b06", + "metadata": {}, + "outputs": [], + "source": [ + "int fib(int n) {\n", + " int x, y;\n", + " if (n < 2) {\n", + " return (n);\n", + " } else {\n", + " x = fib(n - 1);\n", + " y = fib(n - 2);\n", + " return (x + y);\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1fa0307d-fdc9-4503-95cb-1c6448791354", + "metadata": {}, + "outputs": [], + "source": [ + "void processwork(struct node *p) {\n", + " int n, temp;\n", + " n = p->data;\n", + " temp = fib(n);\n", + "\n", + " p->fibdata = temp;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03acb599-9329-49ff-8aff-c0902adb6c3c", + "metadata": {}, + "outputs": [], + "source": [ + "struct node *init_list(struct node *p) {\n", + " int i;\n", + " struct node *head = NULL;\n", + " struct node *temp = NULL;\n", + "\n", + " head = (struct node*) malloc(sizeof(struct node));\n", + " p = head;\n", + " p->data = FS;\n", + " p->fibdata = 0;\n", + " for (i = 0; i < N; i++) {\n", + " temp = (struct node*) malloc(sizeof(struct node));\n", + " p->next = temp;\n", + " p = temp;\n", + " p->data = FS + i + 1;\n", + " p->fibdata = i + 1;\n", + " }\n", + "\n", + " p->next = NULL;\n", + " return head;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f2dfb41b-e55f-43c0-b7f6-546a1697acb1", + "metadata": {}, + "outputs": [], + "source": [ + "int main() {\n", + " double start, end;\n", + " struct node *p = NULL;\n", + " struct node *temp = NULL;\n", + " struct node *head = NULL;\n", + "\n", + " printf(\"Process linked list\\n\");\n", + " printf(\" Each linked list node will be processed by function 'processwork()'\\n\");\n", + " printf(\" Each ll node will compute %d fibonacci numbers beginning with %d\\n\", N, FS);\n", + "\n", + " omp_set_num_threads(omp_get_max_threads());\n", + "\n", + " p = init_list(p);\n", + " head = p;\n", + "\n", + " start = omp_get_wtime();\n", + "\n", + "#pragma omp parallel\n", + " {\n", + "#pragma omp master\n", + " printf(\"Threads: %d\\n\", omp_get_num_threads());\n", + "\n", + "#pragma omp single\n", + " {\n", + " p = head;\n", + " while (p) {\n", + "#pragma omp task firstprivate(p) // first private is required\n", + " {\n", + " processwork(p);\n", + " }\n", + " p = p->next;\n", + " }\n", + " }\n", + " }\n", + "\n", + " end = omp_get_wtime();\n", + " p = head;\n", + " while (p != NULL) {\n", + " printf(\"%d : %d\\n\", p->data, p->fibdata);\n", + " temp = p->next;\n", + " free(p);\n", + " p = temp;\n", + " }\n", + "\n", + " free(p);\n", + " printf(\"Compute Time: %f seconds\\n\", end - start);\n", + "\n", + " return 0;\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "353e5dfd-fcae-43e6-97e3-ec98070811a1", + "metadata": {}, + "outputs": [], + "source": [ + "main();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a24f21c1-2ddc-4e46-b9ee-7cc98fc2821a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++14", + "language": "C++14", + "name": "xcpp14" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/omp/mandel.ipynb b/notebooks/omp/mandel.ipynb new file mode 100644 index 00000000..7c499cfa --- /dev/null +++ b/notebooks/omp/mandel.ipynb @@ -0,0 +1,132 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5059dbdd-821d-498a-8716-eb0fcf8a8f5f", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b66f96a-14ef-4f23-8024-bcfc42b31e4e", + "metadata": {}, + "outputs": [], + "source": [ + "#define NPOINTS 1000\n", + "#define MAXITER 1000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d89dd57c-fe19-4233-a33a-df9b24fae98a", + "metadata": {}, + "outputs": [], + "source": [ + "int numoutside = 0;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c35c479-2f79-46b7-bc66-24be6b1694e0", + "metadata": {}, + "outputs": [], + "source": [ + "void testpoint(double creal, double cimag) {\n", + " // iterate z=z*z+c, until |z| > 2 when point is known to be outside set\n", + " // If loop count reaches MAXITER, point is considered to be inside the set\n", + "\n", + " double zreal, zimag, temp;\n", + " int iter;\n", + " zreal = creal;\n", + " zimag = cimag;\n", + "\n", + " for (iter = 0; iter < MAXITER; iter++) {\n", + " temp = (zreal * zreal) - (zimag * zimag) + creal;\n", + " zimag = zreal * zimag * 2 + cimag;\n", + " zreal = temp;\n", + " if ((zreal * zreal + zimag * zimag) > 4.0) {\n", + " numoutside++;\n", + " break;\n", + " }\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea116fef-7d05-4e29-97a1-55c85c7241d8", + "metadata": {}, + "outputs": [], + "source": [ + "int main() {\n", + " int i, j;\n", + " double area, error, eps = 1.0e-5;\n", + " double cimag, creal;\n", + " // Loop over grid of points in the complex plane which contains the Mandelbrot set,\n", + " // testing each point to see whether it is inside or outside the set.\n", + "\n", + "#pragma omp parallel for private(eps)\n", + " for (i = 0; i < NPOINTS; i++) {\n", + " for (j = 0; j < NPOINTS; j++) {\n", + " creal = -2.0 + 2.5 * (double) (i) / (double) (NPOINTS) + eps;\n", + " cimag = 1.125 * (double) (j) / (double) (NPOINTS) + eps;\n", + " testpoint(creal, cimag);\n", + " }\n", + " }\n", + "\n", + " // Calculate area of set and error estimate and output the results\n", + " area = 2.0 * 2.5 * 1.125 * (double) (NPOINTS * NPOINTS - numoutside) / (double) (NPOINTS * NPOINTS);\n", + " error = area / (double) NPOINTS;\n", + "\n", + " printf(\"Area of Mandlebrot set = %12.8f +/- %12.8f\\n\", area, error);\n", + " printf(\"Correct answer should be around 1.510659\\n\");\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "39cf129c-8106-4e67-a2f1-1a7fff17cd38", + "metadata": {}, + "outputs": [], + "source": [ + "main();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7bcf082a-e0a5-4260-8729-cbfab515cc6a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++14", + "language": "C++14", + "name": "xcpp14" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/omp/pi_integral.ipynb b/notebooks/omp/pi_integral.ipynb new file mode 100644 index 00000000..50b9b88d --- /dev/null +++ b/notebooks/omp/pi_integral.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d8967ce2-994c-441e-b796-4099c6c6853c", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93b89565-44fe-4729-980b-d4f897161b0b", + "metadata": {}, + "outputs": [], + "source": [ + "Cpp::LoadLibrary(\"libomp\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9078ac79-ca50-4fef-b785-37f35fec3cab", + "metadata": {}, + "outputs": [], + "source": [ + "static long num_steps = 100000000;\n", + "double step;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3c10995-6f29-4d71-9e61-1993ca9d1cc9", + "metadata": {}, + "outputs": [], + "source": [ + "int main() {\n", + " int i, j, num_threads_allocated;\n", + " double x, pi, sum = 0.0;\n", + " double start_time, run_time;\n", + "\n", + " step = 1.0 / (double)num_steps;\n", + " printf(\"Num threads available: %d\\n\", omp_get_max_threads());\n", + " for (i = 1; i <= 4; i++) {\n", + " sum = 0.0;\n", + " omp_set_num_threads(i);\n", + " start_time = omp_get_wtime();\n", + "#pragma omp parallel\n", + " {\n", + " num_threads_allocated = omp_get_num_threads();\n", + "#pragma omp single\n", + " printf(\"Num threads allocated for this run: %d\\n\", num_threads_allocated);\n", + "\n", + "#pragma omp for reduction(+ : sum)\n", + " for (j = 1; j <= num_steps; j++) {\n", + " x = (j - 0.5) * step;\n", + " sum = sum + 4.0 / (1.0 + x * x);\n", + " }\n", + " }\n", + "\n", + " pi = step * sum;\n", + " run_time = omp_get_wtime() - start_time;\n", + " printf(\"pi is %f in %f seconds using %d threads\\n\\n\", pi, run_time, num_threads_allocated);\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f84442a-d947-4860-bd3c-aeeea963b419", + "metadata": {}, + "outputs": [], + "source": [ + "main();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7066da1-729b-4230-980f-9be9430c19d6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "C++14", + "language": "C++14", + "name": "xcpp14" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "14" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/openmp-demo.ipynb b/notebooks/openmp-demo.ipynb new file mode 100644 index 00000000..a9ac4a2a --- /dev/null +++ b/notebooks/openmp-demo.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "b0c15570-ee24-42ed-b61f-11a3fc858b2d", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \"omp.h\"\n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b4dac8e-3ad2-46eb-b801-ba717e664b93", + "metadata": {}, + "outputs": [], + "source": [ + "Cpp::LoadLibrary(\"libomp\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5001e441-1fa5-4bdc-9fa5-2ca103ae484f", + "metadata": {}, + "outputs": [], + "source": [ + "void example1() {\n", + " std::cout << \"Hello World!\" << std::endl;\n", + "}\n", + "example1();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53fb7656-b72e-42bc-ade7-2ae2077142da", + "metadata": {}, + "outputs": [], + "source": [ + "void example2() {\n", + " #pragma omp parallel\n", + " {\n", + " std::cout << \"Hello World!\" << std::endl;\n", + " }\n", + "}\n", + "example2();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "efcdfdb6-a60b-46af-8194-75ef9cc0e27f", + "metadata": {}, + "outputs": [], + "source": [ + "void example3() {\n", + " #pragma omp parallel\n", + " {\n", + " std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + "}\n", + "example3();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d86a9efa-ba28-4cb6-bbfc-abc00ee63506", + "metadata": {}, + "outputs": [], + "source": [ + "void example4() {\n", + " #pragma omp parallel\n", + " {\n", + " std::cout << \"Hello World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + "\n", + " std::cout << \"This is another message! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + "\n", + " #pragma omp parallel num_threads(2)\n", + " {\n", + " std::cout << \"Goodbye World! (\" << omp_get_thread_num() << \")\" << std::endl;\n", + " }\n", + "}\n", + "example4();" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5557e01a-7c7d-4b54-8545-962ad11027df", + "metadata": {}, + "outputs": [], + "source": [ + "void example5() {\n", + " double start_time = omp_get_wtime();\n", + " double start_loop;\n", + " \n", + " const int N = 1000000000;\n", + " int* a = new int[N];\n", + " int* b = new int[N];\n", + " \n", + " start_loop = omp_get_wtime();\n", + " #pragma omp parallel for\n", + " for (int i=0; i\n", + "#include \n", + "#include \"cuda_runtime.h\"\n", + "\n", + "inline __device__ __host__ float clamp(float f, float a, float b)\n", + "{\n", + " return fmaxf(a, fminf(f, b));\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "29109b8c-ac5e-4826-aa48-6e167a48c059", + "metadata": {}, + "source": [ + "### Image helper" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0c76fca2-0878-4b0d-bea4-16de34b31cf5", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "\n", + "#ifdef __unix\n", + "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n", + "#endif\n", + "\n", + "inline uint8_t ToByte(float color, float gamma = 2.2f) noexcept\n", + "{\n", + " const float gcolor = std::pow(color, 1.0 / gamma);\n", + " return static_cast(clamp(255.0f * gcolor, 0.0f, 255.0f));\n", + "}\n", + "\n", + "inline void WritePPM(uint32_t w, uint32_t h, \n", + " const float3* Ls, \n", + " const char* fname = \"smallptCuda.ppm\") noexcept\n", + "{\n", + " FILE* fp;\n", + " \n", + " fopen_s(&fp, fname, \"w\");\n", + " \n", + " std::fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n", + " for (std::size_t i = 0; i < w * h; ++i) {\n", + " std::fprintf(fp, \"%u %u %u \", \n", + " ToByte(Ls[i].x), \n", + " ToByte(Ls[i].y), \n", + " ToByte(Ls[i].z));\n", + " }\n", + " \n", + " std::fclose(fp);\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "caa7fb92-d64d-45b9-8d4f-9f9d81e75bfa", + "metadata": { + "tags": [] + }, + "source": [ + "# SmallPT CUDA" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd12e5f-59ed-4367-8d22-65838179dc33", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "#include \n", + "\n", + "///#include \n", + "///#include \n", + "///#include \n", + "///#include \n", + "\n", + "#define PI 3.14159265359f\n", + "\n", + "#define CUDA_SAFE_CALL(call) { \\\n", + "cudaError err = ( call); \\\n", + "if(cudaSuccess != err) { \\\n", + " fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\", \\\n", + " __FILE__, __LINE__, cudaGetErrorString( err) ); \\\n", + "} }\n", + "\n", + "// -----------------------------------GPU Func-----------------------------------\n", + "// From [smallpt](http://www.kevinbeason.com/smallpt/)\n", + "enum materialType\n", + "{ \n", + " DIFFUSE = 0, \n", + " MIRROR, \n", + " GLASS\n", + "};\n", + "\n", + "struct __align__(16) Ray\n", + "{\n", + " __device__ Ray() {}\n", + "\n", + " __device__ Ray(float3 origin, float3 direction) \n", + " : origin(origin), direction(direction) {}\n", + "\n", + " float3 origin;\n", + " float3 direction;\n", + "};\n", + "\n", + "struct __align__(16) sphere\n", + "{\n", + " float radius;\n", + " float3 center, emission, reflectance;\n", + " materialType type;\n", + "\n", + " __device__ double intersect(const Ray &r) const\n", + " {\n", + "\n", + " float3 op = center - r.origin;\n", + " float t, epsilon = 0.0001f; // epsilon required to prevent floating point precision artefacts\n", + " float b = dot(op, r.direction); // b in quadratic equation\n", + " float disc = b*b - dot(op, op) + radius*radius; // discriminant quadratic equation\n", + " if(disc < 0) return 0; // if disc < 0, no real solution (we're not interested in complex roots) \n", + " else disc = sqrtf(disc); // if disc >= 0, check for solutions using negative and positive discriminant\n", + " return (t = b - disc) > epsilon ? t : ((t = b + disc) > epsilon ? t : 0); // pick closest point in front of ray origin\n", + " }\n", + "};\n", + "\n", + "__constant__ sphere spheres[] = {\n", + " {1e5f,{1e5f + 1.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{0.75f, 0.25f, 0.25f}, DIFFUSE}, //Left \n", + " {1e5f,{-1e5f + 99.0f, 40.8f, 81.6f},{0.0f, 0.0f, 0.0f},{.25f, .25f, .75f}, DIFFUSE}, //Rght \n", + " {1e5f,{50.0f, 40.8f, 1e5f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Back \n", + " {1e5f,{50.0f, 40.8f, -1e5f + 170.0f},{0.0f, 0.0f, 0.0f},{0.0f, 0.0f, 0.0f}, DIFFUSE}, //Frnt \n", + " {1e5f,{50.0f, 1e5f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Botm \n", + " {1e5f,{50.0f, -1e5f + 81.6f, 81.6f},{0.0f, 0.0f, 0.0f},{.75f, .75f, .75f}, DIFFUSE}, //Top \n", + " {16.5f,{27.0f, 16.5f, 47.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, MIRROR},//Mirr\n", + " {16.5f,{73.0f, 16.5f, 78.0f},{0.0f, 0.0f, 0.0f},{1, 1, 1}, GLASS},//Glas\n", + " {600.0f,{50.0f, 681.6f-.27f, 81.6f},{12, 12, 12},{0.0f, 0.0f, 0.0f}, DIFFUSE} // Light\n", + "};\n", + "\n", + "__constant__ const int nsphere = sizeof(spheres) / sizeof(sphere);\n", + "\n", + "__device__ float rgbToLuminance(const float3& rgb)\n", + "{\n", + " const float YWeight[3] = {0.212671f, 0.715160f, 0.072169f};\n", + " return YWeight[0] * rgb.x + YWeight[1] * rgb.y + YWeight[2] * rgb.z;\n", + "}\n", + "\n", + "__device__ bool intersectScene(const Ray &r, float &t, int &id, sphere* pshere, int &nsp)\n", + "{\n", + " float d, inf = t = 1e20; // t is distance to closest intersection, initialise t to a huge number outside scene\n", + " for(int i = nsp; i--;)\n", + " {\n", + " // find closest hit object and point\n", + " if((d = pshere[i].intersect(r)) && d < t)\n", + " {\n", + " t = d;\n", + " id = i;\n", + " }\n", + " }\n", + " \n", + " return t < inf; // returns true if an intersection with the scene occurred, false when no hit\n", + "}\n", + "\n", + "__device__ float clamp(float x) { return x < 0 ? 0 : x>1 ? 1 : x; }\n", + "\n", + "__device__ float gammaCorrection(float x)\n", + "{\n", + " return pow(clamp(x), 1 / 2.2f);\n", + "}\n", + "\n", + "__device__ float3 radiance(Ray &r, curandState* rs, sphere* pshere, int &nsp)\n", + "{\n", + " float3 L = make_float3(0.0f, 0.0f, 0.0f); // accumulates ray colour with each iteration through bounce loop\n", + " float3 throughput = make_float3(1.0f, 1.0f, 1.0f);\n", + " int depth = 0;\n", + "\n", + " // ray bounce loop\n", + " while(1)\n", + " {\n", + " float t; \n", + " int id = 0; \n", + "\n", + " // find closest intersection with object's index\n", + " if(!intersectScene(r, t, id, pshere, nsp))\n", + " break;\n", + "\n", + " const sphere &obj = pshere[id];\n", + " float3 hitpoint = r.origin + r.direction * t; \n", + " float3 normal = normalize(hitpoint - obj.center);\n", + " float3 nl = dot(normal, r.direction) < 0 ? normal : normal * -1; // front facing normal\n", + "\n", + " // prevent self-intersection\n", + " r.origin = hitpoint + nl * 0.05f;\n", + "\n", + " //float pdf = 1.0f;\n", + "\n", + " // add emission\n", + " L += throughput * obj.emission;\n", + "\n", + " // different material\n", + " if(obj.type == DIFFUSE)\n", + " { \n", + " // uniform sampling hemisphere\n", + " float r1 = 2 * PI * curand_uniform(rs);\n", + " float r2 = curand_uniform(rs);\n", + " float r2s = sqrtf(r2);\n", + "\n", + " // compute local coordinate on the hit point\n", + " float3 w = nl;\n", + " float3 u = normalize(cross((fabs(w.x) > .1 ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));\n", + " float3 v = cross(w, u);\n", + "\n", + " // local to world convert\n", + " r.direction = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));\n", + " //pdf = 1.0f / PI;\n", + "\n", + " // importance sampling no need costheta\n", + " //throughput *= obj.reflectance * dot(r.direction, nl);\n", + " throughput *= obj.reflectance;\n", + " }\n", + " else if(obj.type == MIRROR)\n", + " {\n", + " r.direction = r.direction - normal * 2 * dot(normal, r.direction);\n", + " throughput *= obj.reflectance;\n", + " //pdf = 1.0f;\n", + " }\n", + " else\n", + " {\n", + " r.origin = hitpoint;\n", + "\n", + " // Ideal dielectric REFRACTION\n", + " float3 reflectDir = r.direction - normal * 2 * dot(normal, r.direction);\n", + " // Ray from outside going in?\n", + " bool into = dot(normal, nl) > 0;\n", + " float nc = 1, nt = 1.5, nnt = into ? nc / nt : nt / nc, ddn = dot(r.direction, nl), cos2t;\n", + " \n", + " // total internal reflection\n", + " if((cos2t = 1 - nnt*nnt*(1 - ddn*ddn)) < 0)\n", + " {\n", + " r.direction = reflectDir;\n", + " throughput *= obj.reflectance;\n", + " }\n", + " else\n", + " {\n", + " // refract or reflect\n", + " float3 tdir = normalize(r.direction*nnt - normal*((into ? 1 : -1)*(ddn*nnt + sqrt(cos2t))));\n", + "\n", + " float a = nt - nc, b = nt + nc, R0 = a*a / (b*b), c = 1 - (into ? -ddn : dot(tdir, normal));\n", + "\n", + " float Re = R0 + (1 - R0)*c*c*c*c*c, Tr = 1 - Re, P = .25 + .5*Re, RP = Re / P, TP = Tr / (1 - P);\n", + " \n", + " if(curand_uniform(rs) < P)\n", + " {\n", + " // reflect\n", + " r.direction = reflectDir;\n", + " throughput *= obj.reflectance * RP;\n", + " }\n", + " else\n", + " {\n", + " //refract\n", + " r.direction = tdir;\n", + " throughput *= obj.reflectance * TP;\n", + " //throughput *= make_float3(1, 0, 0);\n", + " }\n", + " }\n", + " }\n", + "\n", + " // Russian roulette Stop with at least some probability to avoid getting stuck\n", + " if(depth++ >= 5)\n", + " {\n", + " float q = min(0.95f, rgbToLuminance(throughput));\n", + " if(curand_uniform(rs) >= q)\n", + " break;\n", + " throughput /= q;\n", + " }\n", + " }\n", + "\n", + " return L;\n", + "}\n", + "\n", + "__global__ void render1(int width, int height)\n", + "{\n", + " printf(\"!1!\\n\");\n", + "}\n", + "\n", + "__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)\n", + "{ \n", + " printf(\"!!!\\n\");\n", + "\n", + " int i = blockDim.x * blockIdx.x + threadIdx.x;\n", + "\n", + " if (i < numElements) {\n", + " C[i] = A[i] + B[i] + 0.0f;\n", + " }\n", + "}\n", + "\n", + "__global__ void render(int spp, int width, int height, float3* output)\n", + "{\n", + " printf(\"!!!\\n\");\n", + " \n", + " //copy spheres to shared memory\n", + " __shared__ int nsp;\n", + " __shared__ sphere sspheres[nsphere];\n", + " __shared__ Ray tRay;\n", + " nsp = nsphere;\n", + "\n", + " sspheres[threadIdx.x % nsp] = spheres[threadIdx.x % nsp];\n", + "\n", + " __syncthreads();\n", + "\n", + " // position of current pixel\n", + " int x = blockIdx.x * blockDim.x + threadIdx.x;\n", + " int y = blockIdx.y * blockDim.y + threadIdx.y;\n", + "\n", + " // index of current pixel\n", + " //int i = (blockIdx.x + blockIdx.y * gridDim.x) * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x;\n", + " int i = (height - y - 1) * width + x;\n", + "\n", + " curandState rs;\n", + " curand_init(i, 0, 0, &rs);\n", + "\n", + " Ray cam(make_float3(50, 52, 295.6), normalize(make_float3(0, -0.042612, -1))); // cam pos, dir \n", + " float3 cx = make_float3(width * 0.5135f / height, 0.0f, 0.0f);\n", + " // .5135 is field of view angle\n", + " float3 cy = normalize(cross(cx, cam.direction)) * 0.5135f;\n", + " float3 color = make_float3(0.0f);\n", + "\n", + " for (int sy = 0; sy < 2; sy++)\n", + " {\n", + " for (int sx = 0; sx < 2; sx++)\n", + " { \n", + " for(int s = 0; s < spp; s++)\n", + " {\n", + " float r1 = curand_uniform(&rs);\n", + " float dx = r1 < 1 ? sqrtf(r1) - 1 : 1-sqrtf(2 - r1);\n", + " float r2 = curand_uniform(&rs);\n", + " float dy = r2 < 1 ? sqrtf(r2) - 1 : 1-sqrtf(2 - r2);\n", + " //--! super sampling\n", + " float3 d = cam.direction + cx*((((sx + dx + .5) / 2) + x) / width - .5) + \n", + " cy*((((sy + dy + .5) / 2) + y) / height - .5);\n", + "\n", + " //Ray tRay = Ray(cam.origin + d * 140, normalize(d));\n", + " tRay.direction = normalize(d);\n", + " tRay.origin = cam.origin + d * 140;\n", + " color += radiance(tRay, &rs, sspheres, nsp) *(.25f / spp);\n", + " }\n", + " }\n", + " }\n", + "\n", + " // output to the cache\n", + " __shared__ float3 temp;\n", + " temp = make_float3(clamp(color.x, 0.0f, 1.0f), clamp(color.y, 0.0f, 1.0f), clamp(color.z, 0.0f, 1.0f));\n", + " output[i] = temp;\n", + "}\n", + "\n", + "// -----------------------------------CPU Func-----------------------------------\n", + "\n", + "void devicePropertyPrint()\n", + "{\n", + " int dev = 0;\n", + " cudaDeviceProp devProp;\n", + " if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n", + " {\n", + " printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n", + " printf(\"Device compute capability: %i.%i\\n\", devProp.major, devProp.minor);\n", + " printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n", + " printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n", + " printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n", + " printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n", + " printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n", + " printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n", + " printf(\"\\n\");\n", + " }\n", + "}\n", + "\n", + "void Render(const std::uint32_t nb_samples) {\n", + "\n", + " // Image Size\n", + " //int width = 1024, height = 768;\n", + " int width = 256, height = 192;\n", + " int spp = nb_samples/4;\n", + "\n", + " printf(\"\\nRendering Size: [%d, %d], spp: %d\\n\", width, height, spp);\n", + " printf(\"------------------Rendering Started------------------\\n\");\n", + "\n", + " //sTimer t;\n", + " \n", + " // Memory on CPU\n", + " float3* outputCPU = new float3[width * height];\n", + " float3* outputGPU;\n", + " CUDA_SAFE_CALL(cudaMalloc(&outputGPU, width * height * sizeof(float3)));\n", + "\n", + " // Ray Pool\n", + " dim3 blockSize(32, 32, 1);\n", + " dim3 gridSize(width / blockSize.x, height / blockSize.y, 1);\n", + "\n", + " //t.start();\n", + "\n", + " \n", + " cudaError_t err = cudaSuccess;\n", + " int numElements = 50000;\n", + " size_t size = numElements * sizeof(float);\n", + " float *d_A = NULL;\n", + " float *d_B = NULL;\n", + " float *d_C = NULL;\n", + " err = cudaMalloc((void **)&d_A, size);\n", + " cudaMalloc((void **)&d_B, size);\n", + " cudaMalloc((void **)&d_C, size);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to allocate device vector A (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " CUDA_SAFE_CALL(cudaDeviceSynchronize());\n", + " \n", + " // Render on GPU\n", + "printf(\"@1@\\n\");\n", + " vectorAdd<<>>(d_A, d_B, d_C, numElements);\n", + "//printf(\"@1@\\n\");\n", + " //render<<>>(spp, width, height, outputGPU);\n", + " CUDA_SAFE_CALL(cudaPeekAtLastError());\n", + " CUDA_SAFE_CALL(cudaDeviceSynchronize());\n", + " \n", + " //t.end();\n", + "\n", + " // Copy Mem from GPU to CPU\n", + " CUDA_SAFE_CALL(cudaMemcpy(outputCPU, outputGPU, width * height * sizeof(float3), cudaMemcpyDeviceToHost));\n", + "\n", + " // free CUDA memory\n", + " cudaFree(outputGPU);\n", + "\n", + " printf(\"------------------Rendering Ended------------------\\n\");\n", + " //printf(\"Cost time: %f\\n\", t.difference());\n", + "\n", + " WritePPM(width, height, outputCPU);\n", + "}\n", + "\n", + "const std::uint32_t nb_samples = 40;\n", + "devicePropertyPrint();\n", + "Render(nb_samples);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CppInterOp (C++17)", + "language": "CUDA", + "name": "cppinterop-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "C++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/vectorAddCuda.ipynb b/notebooks/vectorAddCuda.ipynb new file mode 100644 index 00000000..e339d051 --- /dev/null +++ b/notebooks/vectorAddCuda.ipynb @@ -0,0 +1,1016 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "57b6c4aa-b3a2-4a0e-9192-cc84e2215f96", + "metadata": {}, + "source": [ + "# CUDA Vector Add Demo\n", + "\n", + "Adapted from https://github.com/NVIDIA/cuda-samples.git" + ] + }, + { + "cell_type": "markdown", + "id": "d80193b1-362f-4b64-b8ef-c51d0405e12f", + "metadata": { + "tags": [] + }, + "source": [ + "## Main helpers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d1fefca-10c2-4fed-9c77-cec0882808b0", + "metadata": {}, + "outputs": [], + "source": [ + "#include \n", + "#include \n", + "\n", + "///\n", + "\n", + "//#include \"cuda_runtime.h\"\n", + "#include \"curand_kernel.h\"\n", + "\n", + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "#include \n", + "#include " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "597e32ba-8759-4a94-aa25-3b8d11ff99b8", + "metadata": {}, + "outputs": [], + "source": [ + " constexpr double g_pi = 3.14159265358979323846;\n", + "\n", + " __host__ __device__ inline double Clamp(double v, \n", + " double low = 0.0, double high = 1.0) noexcept\n", + " {\n", + " return fmin(fmax(v, low), high);\n", + " }\n", + "\n", + " inline std::uint8_t ToByte(double color, double gamma = 2.2) noexcept\n", + " {\n", + " const double gcolor = std::pow(color, 1.0 / gamma);\n", + " return static_cast(Clamp(255.0 * gcolor, 0.0, 255.0));\n", + " }\n", + "\n", + " inline void HandleError(cudaError_t err, const char* file, int line)\n", + " {\n", + " if (cudaSuccess != err) {\n", + " std::printf(\"%s in %s at line %d\\n\", \n", + " cudaGetErrorString(err), file, line);\n", + " std::exit(EXIT_FAILURE);\n", + " }\n", + " }\n", + "//}\n", + "\n", + "#define CUDA_SAFE_CALL(call) { \\\n", + "cudaError err = ( call); \\\n", + "if(cudaSuccess != err) { \\\n", + " fprintf(stderr, \"Cuda error in file '%s' in line %i : %s.\\n\", \\\n", + " __FILE__, __LINE__, cudaGetErrorString( err) ); \\\n", + "} }\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f4554eb-e09c-4aac-9c5d-486837177a01", + "metadata": {}, + "outputs": [], + "source": [ + "//namespace smallpt {\n", + "\n", + " struct Vector3 {\n", + "\n", + " //public:\n", + "\n", + " __host__ __device__ explicit Vector3(double xyz = 0.0) noexcept\n", + " : Vector3(xyz, xyz, xyz) {}\n", + " __host__ __device__ Vector3(double x, double y, double z) noexcept\n", + " : m_x(x), m_y(y), m_z(z) {}\n", + " Vector3(const Vector3& v) noexcept = default;\n", + " Vector3(Vector3&& v) noexcept = default;\n", + " ~Vector3() = default;\n", + "\n", + " Vector3& operator=(const Vector3& v) = default;\n", + " Vector3& operator=(Vector3&& v) = default;\n", + "\n", + " __device__ bool HasNaNs() const {\n", + " return std::isnan(m_x) || std::isnan(m_y) || std::isnan(m_z);\n", + " }\n", + "\n", + " __device__ const Vector3 operator-() const {\n", + " return { -m_x, -m_y, -m_z };\n", + " }\n", + "\n", + " __device__ const Vector3 operator+(const Vector3& v) const {\n", + " return { m_x + v.m_x, m_y + v.m_y, m_z + v.m_z };\n", + " }\n", + " __device__ const Vector3 operator-(const Vector3& v) const {\n", + " return { m_x - v.m_x, m_y - v.m_y, m_z - v.m_z };\n", + " }\n", + " __device__ const Vector3 operator*(const Vector3& v) const {\n", + " return { m_x * v.m_x, m_y * v.m_y, m_z * v.m_z };\n", + " }\n", + " __device__ const Vector3 operator/(const Vector3& v) const {\n", + " return { m_x / v.m_x, m_y / v.m_y, m_z / v.m_z };\n", + " }\n", + " __device__ const Vector3 operator+(double a) const {\n", + " return { m_x + a, m_y + a, m_z + a };\n", + " }\n", + " __device__ const Vector3 operator-(double a) const {\n", + " return { m_x - a, m_y - a, m_z - a };\n", + " }\n", + " __device__ const Vector3 operator*(double a) const {\n", + " return { m_x * a, m_y * a, m_z * a };\n", + " }\n", + " __device__ const Vector3 operator/(double a) const {\n", + " const double inv_a = 1.0 / a;\n", + " return { m_x * inv_a, m_y * inv_a, m_z * inv_a };\n", + " }\n", + "\n", + " __device__ Vector3& operator+=(const Vector3& v) {\n", + " m_x += v.m_x;\n", + " m_y += v.m_y;\n", + " m_z += v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator-=(const Vector3& v) {\n", + " m_x -= v.m_x;\n", + " m_y -= v.m_y;\n", + " m_z -= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator*=(const Vector3& v) {\n", + " m_x *= v.m_x;\n", + " m_y *= v.m_y;\n", + " m_z *= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator/=(const Vector3& v) {\n", + " m_x /= v.m_x;\n", + " m_y /= v.m_y;\n", + " m_z /= v.m_z;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator+=(double a) {\n", + " m_x += a;\n", + " m_y += a;\n", + " m_z += a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator-=(double a) {\n", + " m_x -= a;\n", + " m_y -= a;\n", + " m_z -= a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator*=(double a) {\n", + " m_x *= a;\n", + " m_y *= a;\n", + " m_z *= a;\n", + " return *this;\n", + " }\n", + " __device__ Vector3& operator/=(double a) {\n", + " const double inv_a = 1.0 / a;\n", + " m_x *= inv_a;\n", + " m_y *= inv_a;\n", + " m_z *= inv_a;\n", + " return *this;\n", + " }\n", + "\n", + " __device__ double Dot(const Vector3& v) const {\n", + " return m_x * v.m_x + m_y * v.m_y + m_z * v.m_z;\n", + " }\n", + " __device__ const Vector3 Cross(const Vector3& v) const {\n", + " return {\n", + " m_y * v.m_z - m_z * v.m_y,\n", + " m_z * v.m_x - m_x * v.m_z,\n", + " m_x * v.m_y - m_y * v.m_x\n", + " };\n", + " }\n", + "\n", + " __device__ bool operator==(const Vector3& rhs) const {\n", + " return m_x == rhs.m_x && m_y == rhs.m_y && m_z == rhs.m_z;\n", + " }\n", + " __device__ bool operator!=(const Vector3& rhs) const {\n", + " return !(*this == rhs);\n", + " }\n", + "\n", + " __device__ double& operator[](size_t i) {\n", + " return (&m_x)[i];\n", + " }\n", + " __device__ double operator[](size_t i) const {\n", + " return (&m_x)[i];\n", + " }\n", + "\n", + " __device__ size_t MinDimension() const {\n", + " return (m_x < m_y && m_x < m_z) ? 0u : ((m_y < m_z) ? 1u : 2u);\n", + " }\n", + " __device__ size_t MaxDimension() const {\n", + " return (m_x > m_y && m_x > m_z) ? 0u : ((m_y > m_z) ? 1u : 2u);\n", + " }\n", + " __device__ double Min() const {\n", + " return fmin(m_x, fmin(m_y, m_z));\n", + " }\n", + " __device__ double Max() const {\n", + " return fmax(m_x, fmax(m_y, m_z));\n", + " }\n", + "\n", + " __device__ double Norm2_squared() const {\n", + " return m_x * m_x + m_y * m_y + m_z * m_z;\n", + " }\n", + "\n", + " __device__ double Norm2() const {\n", + " return std::sqrt(Norm2_squared());\n", + " }\n", + "\n", + " __device__ void Normalize() {\n", + " const double a = 1.0 / Norm2();\n", + " m_x *= a;\n", + " m_y *= a;\n", + " m_z *= a;\n", + " }\n", + "\n", + " double m_x, m_y, m_z;\n", + " };\n", + "\n", + " __device__ inline const Vector3 operator+(double a, const Vector3& v) {\n", + " return { a + v.m_x, a + v.m_y, a + v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator-(double a, const Vector3& v) {\n", + " return { a - v.m_x, a - v.m_y, a - v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator*(double a, const Vector3& v) {\n", + " return { a * v.m_x, a * v.m_y, a * v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 operator/(double a, const Vector3& v) {\n", + " return { a / v.m_x, a / v.m_y, a / v.m_z };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Sqrt(const Vector3& v) {\n", + " return {\n", + " std::sqrt(v.m_x),\n", + " std::sqrt(v.m_y),\n", + " std::sqrt(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Pow(const Vector3& v, double a) {\n", + " return {\n", + " std::pow(v.m_x, a),\n", + " std::pow(v.m_y, a),\n", + " std::pow(v.m_z, a)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Abs(const Vector3& v) {\n", + " return {\n", + " std::abs(v.m_x),\n", + " std::abs(v.m_y),\n", + " std::abs(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Min(const Vector3& v1, const Vector3& v2) {\n", + " return {\n", + " fmin(v1.m_x, v2.m_x),\n", + " fmin(v1.m_y, v2.m_y),\n", + " fmin(v1.m_z, v2.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Max(const Vector3& v1, const Vector3& v2) {\n", + " return {\n", + " fmax(v1.m_x, v2.m_x),\n", + " fmax(v1.m_y, v2.m_y),\n", + " fmax(v1.m_z, v2.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Round(const Vector3& v) {\n", + " return {\n", + " std::round(v.m_x),\n", + " std::round(v.m_y),\n", + " std::round(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Floor(const Vector3& v) {\n", + " return {\n", + " std::floor(v.m_x),\n", + " std::floor(v.m_y),\n", + " std::floor(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Ceil(const Vector3& v) {\n", + " return {\n", + " std::ceil(v.m_x),\n", + " std::ceil(v.m_y),\n", + " std::ceil(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Trunc(const Vector3& v) {\n", + " return {\n", + " std::trunc(v.m_x),\n", + " std::trunc(v.m_y),\n", + " std::trunc(v.m_z)\n", + " };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Clamp(const Vector3& v, \n", + " double low = 0.0, double high = 1.0) {\n", + " return {\n", + " Clamp(v.m_x, low, high),\n", + " Clamp(v.m_y, low, high),\n", + " Clamp(v.m_z, low, high) }\n", + " ;\n", + " }\n", + "\n", + " __device__ inline const Vector3 Lerp(double a, \n", + " const Vector3& v1, const Vector3& v2) {\n", + " return v1 + a * (v2 - v1);\n", + " }\n", + "\n", + " template\n", + " __device__ inline const Vector3 Permute(const Vector3& v) {\n", + " return { v[X], v[Y], v[Z] };\n", + " }\n", + "\n", + " __device__ inline const Vector3 Normalize(const Vector3& v) {\n", + " const double a = 1.0 / v.Norm2();\n", + " return a * v;\n", + " }\n", + "//}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6426445-b5b5-47a7-82e0-d956b86a0bd9", + "metadata": {}, + "outputs": [], + "source": [ + "#ifdef __unix\n", + "#define fopen_s(pFile,filename,mode) ((*(pFile))=fopen((filename),(mode)))==NULL\n", + "#endif\n", + "\n", + "//namespace smallpt {\n", + "\n", + " inline void WritePPM(uint32_t w, uint32_t h, \n", + " const Vector3* Ls, \n", + " const char* fname = \"cu-image.ppm\") noexcept {\n", + " \n", + " FILE* fp;\n", + " \n", + " fopen_s(&fp, fname, \"w\");\n", + " \n", + " fprintf(fp, \"P3\\n%u %u\\n%u\\n\", w, h, 255u);\n", + " for (size_t i = 0; i < w * h; ++i) {\n", + " fprintf(fp, \"%u %u %u \", \n", + " ToByte(Ls[i].m_x), \n", + " ToByte(Ls[i].m_y), \n", + " ToByte(Ls[i].m_z));\n", + " }\n", + " \n", + " fclose(fp);\n", + " }\n", + "//}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cfd9cc8-9419-4834-9dba-6a8c98e64ba2", + "metadata": {}, + "outputs": [], + "source": [ + "#define REFRACTIVE_INDEX_OUT 1.0\n", + "#define REFRACTIVE_INDEX_IN 1.5\n", + "#define EPSILON_SPHERE 1e-4\n", + "\n", + "//namespace smallpt {\n", + "\n", + " struct __align__(16) Ray {\n", + " __device__ explicit Ray(Vector3 o, \n", + " Vector3 d, \n", + " double tmin = 0.0, \n", + " double tmax = std::numeric_limits< double >::infinity(), \n", + " std::uint32_t depth = 0u) noexcept\n", + " : m_o(std::move(o)),\n", + " m_d(std::move(d)),\n", + " m_tmin(tmin),\n", + " m_tmax(tmax),\n", + " m_depth(depth) {};\n", + " Ray(const Ray& ray) noexcept = default;\n", + " Ray(Ray&& ray) noexcept = default;\n", + " ~Ray() = default;\n", + "\n", + " Ray& operator=(const Ray& ray) = default;\n", + " Ray& operator=(Ray&& ray) = default;\n", + "\n", + " __device__ const Vector3 operator()(double t) const {\n", + " return m_o + m_d * t;\n", + " }\n", + "\n", + " Vector3 m_o, m_d;\n", + " mutable double m_tmin, m_tmax;\n", + " std::uint32_t m_depth;\n", + " };\n", + "\n", + " enum struct Reflection_t {\n", + " Diffuse,\n", + " Specular,\n", + " Refractive\n", + " };\n", + "\n", + " struct __align__(16) Sphere {\n", + " __host__ __device__ explicit Sphere(double r,\n", + " Vector3 p,\n", + " Vector3 e,\n", + " Vector3 f,\n", + " Reflection_t reflection_t) noexcept\n", + " : m_r(r),\n", + " m_p(std::move(p)),\n", + " m_e(std::move(e)),\n", + " m_f(std::move(f)),\n", + " m_reflection_t(reflection_t) {}\n", + " Sphere(const Sphere& sphere) noexcept = default;\n", + " Sphere(Sphere&& sphere) noexcept = default;\n", + " ~Sphere() = default;\n", + "\n", + " Sphere& operator=(const Sphere& sphere) = default;\n", + " Sphere& operator=(Sphere&& sphere) = default;\n", + " \n", + " __device__ bool Intersect(const Ray& ray) const {\n", + " const Vector3 op = m_p - ray.m_o;\n", + " const double dop = ray.m_d.Dot(op);\n", + " const double D = dop * dop - op.Dot(op) + m_r * m_r;\n", + "\n", + " if (0.0 > D) {\n", + " return false;\n", + " }\n", + "\n", + " const double sqrtD = sqrt(D);\n", + "\n", + " const double tmin = dop - sqrtD;\n", + " if (ray.m_tmin < tmin && tmin < ray.m_tmax) {\n", + " ray.m_tmax = tmin;\n", + " return true;\n", + " }\n", + "\n", + " const double tmax = dop + sqrtD;\n", + " if (ray.m_tmin < tmax && tmax < ray.m_tmax) {\n", + " ray.m_tmax = tmax;\n", + " return true;\n", + " }\n", + "\n", + " return false;\n", + " }\n", + "\n", + " double m_r;\n", + " Vector3 m_p; // position\n", + " Vector3 m_e; // emission\n", + " Vector3 m_f; // reflection\n", + " Reflection_t m_reflection_t;\n", + " };\n", + "\n", + " \n", + " const Sphere g_spheres[] = {\n", + " Sphere(1e5, Vector3(1e5 + 1, 40.8, 81.6), Vector3(), Vector3(0.75,0.25,0.25), Reflection_t::Diffuse), //Left\n", + " Sphere(1e5, Vector3(-1e5 + 99, 40.8, 81.6), Vector3(), Vector3(0.25,0.25,0.75), Reflection_t::Diffuse), //Right\n", + " Sphere(1e5, Vector3(50, 40.8, 1e5), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Back\n", + " Sphere(1e5, Vector3(50, 40.8, -1e5 + 170), Vector3(), Vector3(), Reflection_t::Diffuse), //Front\n", + " Sphere(1e5, Vector3(50, 1e5, 81.6), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Bottom\n", + " Sphere(1e5, Vector3(50, -1e5 + 81.6, 81.6), Vector3(), Vector3(0.75), Reflection_t::Diffuse), //Top\n", + " Sphere(16.5, Vector3(27, 16.5, 47), Vector3(), Vector3(0.999), Reflection_t::Specular), //Mirror\n", + " Sphere(16.5, Vector3(73, 16.5, 78), Vector3(), Vector3(0.999), Reflection_t::Refractive), //Glass\n", + " Sphere(600, Vector3(50, 681.6 - .27, 81.6), Vector3(12), Vector3(), Reflection_t::Diffuse) //Light\n", + " };\n", + "\n", + "\n", + " __device__ inline Vector3 UniformSampleOnHemisphere(double u1, double u2) {\n", + " // u1 := cos_theta\n", + " const double sin_theta = std::sqrt(fmax(0.0, 1.0 - u1 * u1));\n", + " const double phi = 2.0 * g_pi * u2;\n", + " return {\n", + " std::cos(phi) * sin_theta,\n", + " std::sin(phi) * sin_theta,\n", + " u1\n", + " };\n", + " }\n", + "\n", + " __device__ inline Vector3 CosineWeightedSampleOnHemisphere(double u1, double u2) {\n", + " const double cos_theta = sqrt(1.0 - u1);\n", + " const double sin_theta = sqrt(u1);\n", + " const double phi = 2.0 * g_pi * u2;\n", + " return {\n", + " std::cos(phi) * sin_theta,\n", + " std::sin(phi) * sin_theta,\n", + " cos_theta\n", + " };\n", + " }\n", + " \n", + " __device__ inline double Reflectance0(double n1, double n2) {\n", + " const double sqrt_R0 = (n1 - n2) / (n1 + n2);\n", + " return sqrt_R0 * sqrt_R0;\n", + " }\n", + "\n", + " __device__ inline double SchlickReflectance(double n1, \n", + " double n2, \n", + " double c) {\n", + " const double R0 = Reflectance0(n1, n2);\n", + " return R0 + (1.0 - R0) * c * c * c * c * c;\n", + " }\n", + "\n", + " __device__ inline const Vector3 IdealSpecularReflect(const Vector3& d, const Vector3& n) {\n", + " return d - 2.0 * n.Dot(d) * n;\n", + " }\n", + "\n", + " __device__ inline const Vector3 IdealSpecularTransmit(const Vector3& d, \n", + " const Vector3& n, \n", + " double n_out, \n", + " double n_in, \n", + " double& pr, \n", + " curandState* state) {\n", + " \n", + " const Vector3 d_Re = IdealSpecularReflect(d, n);\n", + "\n", + " const bool out_to_in = (0.0 > n.Dot(d));\n", + " const Vector3 nl = out_to_in ? n : -n;\n", + " const double nn = out_to_in ? n_out / n_in : n_in / n_out;\n", + " const double cos_theta = d.Dot(nl);\n", + " const double cos2_phi = 1.0 - nn * nn * (1.0 - cos_theta * cos_theta);\n", + "\n", + " // Total Internal Reflection\n", + " if (0.0 > cos2_phi) {\n", + " pr = 1.0;\n", + " return d_Re;\n", + " }\n", + "\n", + " const Vector3 d_Tr = Normalize(nn * d - nl * (nn * cos_theta + sqrt(cos2_phi)));\n", + " const double c = 1.0 - (out_to_in ? -cos_theta : d_Tr.Dot(n));\n", + "\n", + " const double Re = SchlickReflectance(n_out, n_in, c);\n", + " const double p_Re = 0.25 + 0.5 * Re;\n", + " if (curand_uniform_double(state) < p_Re) {\n", + " pr = (Re / p_Re);\n", + " return d_Re;\n", + " }\n", + " else {\n", + " const double Tr = 1.0 - Re;\n", + " const double p_Tr = 1.0 - p_Re;\n", + " pr = (Tr / p_Tr);\n", + " return d_Tr;\n", + " }\n", + " }\n", + "\n", + " __device__ inline bool Intersect(const Sphere* dev_spheres, \n", + " size_t nb_spheres, \n", + " const Ray& ray, \n", + " size_t& id)\n", + " {\n", + " bool hit = false;\n", + " for (size_t i = 0u; i < nb_spheres; ++i) {\n", + " if (dev_spheres[i].Intersect(ray)) {\n", + " hit = true;\n", + " id = i;\n", + " }\n", + " }\n", + "\n", + " return hit;\n", + " }\n", + "\n", + "//__device__ static Vector3 Radiance(const Sphere* dev_spheres, \n", + " __device__ Vector3 Radiance(const Sphere* dev_spheres, \n", + " size_t nb_spheres,\n", + " const Ray& ray, \n", + " curandState* state)\n", + " { \n", + " Ray r = ray;\n", + " Vector3 L;\n", + " Vector3 F(1.0);\n", + "\n", + " while (true) {\n", + " size_t id;\n", + " if (!Intersect(dev_spheres, nb_spheres, r, id)) {\n", + " return L;\n", + " }\n", + "\n", + " const Sphere& shape = dev_spheres[id];\n", + " const Vector3 p = r(r.m_tmax);\n", + " const Vector3 n = Normalize(p - shape.m_p);\n", + "\n", + " L += F * shape.m_e;\n", + " F *= shape.m_f;\n", + "\n", + " // Russian roulette\n", + " if (4 < r.m_depth) {\n", + " const double continue_probability = shape.m_f.Max();\n", + " if (curand_uniform_double(state) >= continue_probability) {\n", + " return L;\n", + " }\n", + " F /= continue_probability;\n", + " }\n", + "\n", + " // Next path segment\n", + " switch (shape.m_reflection_t) {\n", + " \n", + " case Reflection_t::Specular: {\n", + " const Vector3 d = IdealSpecularReflect(r.m_d, n);\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " break;\n", + " }\n", + " \n", + " case Reflection_t::Refractive: {\n", + " double pr;\n", + " const Vector3 d = IdealSpecularTransmit(r.m_d, n, REFRACTIVE_INDEX_OUT, REFRACTIVE_INDEX_IN, pr, state);\n", + " F *= pr;\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " break;\n", + " }\n", + " \n", + " default: {\n", + " const Vector3 w = (0.0 > n.Dot(r.m_d)) ? n : -n;\n", + " const Vector3 u = Normalize((abs(w.m_x) > 0.1 ? Vector3(0.0, 1.0, 0.0) : Vector3(1.0, 0.0, 0.0)).Cross(w));\n", + " const Vector3 v = w.Cross(u);\n", + "\n", + " const Vector3 sample_d = CosineWeightedSampleOnHemisphere(curand_uniform_double(state), curand_uniform_double(state));\n", + " const Vector3 d = Normalize(sample_d.m_x * u + sample_d.m_y * v + sample_d.m_z * w);\n", + " r = Ray(p, d, EPSILON_SPHERE, INFINITY, r.m_depth + 1u);\n", + " }\n", + " }\n", + " }\n", + " }\n", + "\n", + "// __global__ static void kernel(const Sphere* dev_spheres, \n", + " __global__ void kernel(const Sphere* dev_spheres, \n", + " size_t nb_spheres,\n", + " std::uint32_t w, \n", + " std::uint32_t h, \n", + " Vector3* Ls, \n", + " std::uint32_t nb_samples)\n", + " {\n", + " \n", + " printf(\"!!!\\n\");\n", + "\n", + " const std::uint32_t x = threadIdx.x + blockIdx.x * blockDim.x;\n", + " const std::uint32_t y = threadIdx.y + blockIdx.y * blockDim.y;\n", + " const std::uint32_t offset = x + y * blockDim.x * gridDim.x;\n", + "\n", + " if (x >= w || y >= h) {\n", + " return;\n", + " }\n", + "\n", + " // RNG\n", + " curandState state;\n", + " curand_init(offset, 0u, 0u, &state);\n", + "\n", + " const Vector3 eye = { 50.0, 52.0, 295.6 };\n", + " const Vector3 gaze = Normalize(Vector3(0.0, -0.042612, -1.0));\n", + " const double fov = 0.5135;\n", + " const Vector3 cx = { w * fov / h, 0.0, 0.0 };\n", + " const Vector3 cy = Normalize(cx.Cross(gaze)) * fov;\n", + "\n", + " for (size_t sy = 0u, i = (h - 1u - y) * w + x; sy < 2u; ++sy) { // 2 subpixel row\n", + "\n", + " for (size_t sx = 0u; sx < 2u; ++sx) { // 2 subpixel column\n", + "\n", + " Vector3 L;\n", + "\n", + " for (size_t s = 0u; s < nb_samples; ++s) { // samples per subpixel\n", + " const double u1 = 2.0 * curand_uniform_double(&state);\n", + " const double u2 = 2.0 * curand_uniform_double(&state);\n", + " const double dx = (u1 < 1.0) ? sqrt(u1) - 1.0 : 1.0 - sqrt(2.0 - u1);\n", + " const double dy = (u2 < 1.0) ? sqrt(u2) - 1.0 : 1.0 - sqrt(2.0 - u2);\n", + " const Vector3 d = cx * (((sx + 0.5 + dx) * 0.5 + x) / w - 0.5) +\n", + " cy * (((sy + 0.5 + dy) * 0.5 + y) / h - 0.5) + gaze;\n", + " \n", + " L += Radiance(dev_spheres, nb_spheres, \n", + " Ray(eye + d * 130, Normalize(d), EPSILON_SPHERE), &state) \n", + " * (1.0 / nb_samples);\n", + " }\n", + " \n", + " Ls[i] += 0.25 * Clamp(L);\n", + " }\n", + " }\n", + " }\n", + "\n", + "// static void Render(std::uint32_t nb_samples) noexcept\n", + " void Render(std::uint32_t nb_samples) noexcept\n", + " {\n", + " const std::uint32_t w = 256u; //1024u;\n", + " const std::uint32_t h = 192u; //768u;\n", + " const std::uint32_t nb_pixels = w * h;\n", + "\n", + " // Set up device memory\n", + " Sphere* dev_spheres;\n", + " CUDA_SAFE_CALL(cudaMalloc((void**)&dev_spheres, sizeof(g_spheres)));\n", + " CUDA_SAFE_CALL(cudaMemcpy(dev_spheres, g_spheres, sizeof(g_spheres), cudaMemcpyHostToDevice));\n", + " Vector3* dev_Ls;\n", + " CUDA_SAFE_CALL(cudaMalloc((void**)&dev_Ls, nb_pixels * sizeof(Vector3)));\n", + " CUDA_SAFE_CALL(cudaMemset(dev_Ls, 0, nb_pixels * sizeof(Vector3)));\n", + "\n", + " // Kernel execution\n", + " const dim3 nblocks(w / 16u, h / 16u);\n", + " const dim3 nthreads(16u, 16u);\n", + "\n", + "printf(\"@@@render kernel@@@\\n\");\n", + " kernel<<>>(dev_spheres, sizeof(g_spheres)/sizeof(g_spheres[0]), w, h, dev_Ls, nb_samples);\n", + " CUDA_SAFE_CALL(cudaPeekAtLastError());\n", + " CUDA_SAFE_CALL(cudaDeviceSynchronize());\n", + " \n", + " // Set up host memory\n", + " Vector3* Ls = (Vector3*)malloc(nb_pixels * sizeof(Vector3));\n", + " // Transfer device -> host\n", + " CUDA_SAFE_CALL(cudaMemcpy(Ls, dev_Ls, nb_pixels * sizeof(Vector3), cudaMemcpyDeviceToHost));\n", + "\n", + " // Clean up device memory\n", + " CUDA_SAFE_CALL(cudaFree(dev_Ls));\n", + " CUDA_SAFE_CALL(cudaFree(dev_spheres));\n", + "\n", + " WritePPM(w, h, Ls);\n", + "\n", + " // Clean up host memory\n", + " free(Ls);\n", + " }\n", + "\n", + "//const std::uint32_t nb_samples = 100;\n", + "//smallpt::\n", + "//Render(100);\n", + "//printf(\"Done Render\\n\");" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c70fc5e6-4f6b-4d4d-9cdb-6fa91cb1b09e", + "metadata": {}, + "outputs": [], + "source": [ + "void devicePropertyPrint()\n", + "{\n", + " int dev = 0;\n", + " cudaDeviceProp devProp;\n", + " if(cudaGetDeviceProperties(&devProp, dev) == cudaSuccess)\n", + " {\n", + " printf(\"Device %i, named: %s\\n\", dev, devProp.name);\n", + " printf(\"Device compute capability: %i.%i\\n\", devProp.major, devProp.minor);\n", + " printf(\"Device maxThreadDim: [%i, %i, %i]\\n\", devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);\n", + " printf(\"Device maxGridSize: [%i, %i, %i]\\n\", devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);\n", + " printf(\"Multi Processor Count: %i\\n\", devProp.multiProcessorCount);\n", + " printf(\"Size of SharedMem Per-Block: %f KB\\n\", devProp.sharedMemPerBlock / 1024.0);\n", + " printf(\"Max Threads Per-Block: %i\\n\", devProp.maxThreadsPerBlock);\n", + " printf(\"Max Threads Per-MultiProcessor: %i\\n\", devProp.maxThreadsPerMultiProcessor);\n", + " printf(\"\\n\");\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "601b07fc-dc2b-4d75-90e1-2b6279d77a1d", + "metadata": { + "tags": [] + }, + "source": [ + "## CUDA kernel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d41025be-e3a0-4de4-bfa8-51c2eb7dc4f8", + "metadata": {}, + "outputs": [], + "source": [ + "__global__ void vectorAdd(const float *A, const float *B, float *C, int numElements)\n", + "{ \n", + " printf(\"###\\n\");\n", + "\n", + " int i = blockDim.x * blockIdx.x + threadIdx.x;\n", + "\n", + " if (i < numElements) {\n", + " C[i] = A[i] + B[i] + 0.0f;\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "c178646f-0f60-4da9-988e-9019632f4334", + "metadata": { + "tags": [] + }, + "source": [ + "## Host code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01808b34-0e17-428e-b5e6-2e5fd3cdb1c4", + "metadata": {}, + "outputs": [], + "source": [ + "int main(void) {\n", + "\n", + " // Error code to check return values for CUDA calls\n", + " cudaError_t err = cudaSuccess;\n", + "\n", + " // Print the vector length to be used, and compute its size\n", + " int numElements = 50000;\n", + " size_t size = numElements * sizeof(float);\n", + " printf(\"[Vector addition of %d elements]\\n\", numElements);\n", + "\n", + " // Allocate the host input vector A\n", + " float *h_A = (float *)malloc(size);\n", + "\n", + " // Allocate the host input vector B\n", + " float *h_B = (float *)malloc(size);\n", + "\n", + " // Allocate the host output vector C\n", + " float *h_C = (float *)malloc(size);\n", + "\n", + " // Verify that allocations succeeded\n", + " if (h_A == NULL || h_B == NULL || h_C == NULL) {\n", + " printf(\"Failed to allocate host vectors!\\n\");\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Initialize the host input vectors\n", + " for (int i = 0; i < numElements; ++i) {\n", + " h_A[i] = rand() / (float)RAND_MAX;\n", + " h_B[i] = rand() / (float)RAND_MAX;\n", + " }\n", + "\n", + " // Allocate the device input vector A\n", + " float *d_A = NULL;\n", + " err = cudaMalloc((void **)&d_A, size);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to allocate device vector A (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Allocate the device input vector B\n", + " float *d_B = NULL;\n", + " err = cudaMalloc((void **)&d_B, size);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to allocate device vector B (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Allocate the device output vector C\n", + " float *d_C = NULL;\n", + " err = cudaMalloc((void **)&d_C, size);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to allocate device vector C (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Copy the host input vectors A and B in host memory to the device input\n", + " // vectors in\n", + " // device memory\n", + " printf(\"Copy input data from the host memory to the CUDA device\\n\");\n", + " err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\n", + " \"Failed to copy vector A from host to device (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\n", + " \"Failed to copy vector B from host to device (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Launch the Vector Add CUDA Kernel\n", + " int threadsPerBlock = 256;\n", + " int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;\n", + " printf(\"CUDA kernel launch with %d blocks of %d threads\\n\", blocksPerGrid,\n", + " threadsPerBlock);\n", + "printf(\"@@@add kernel@@@\\n\");\n", + " vectorAdd<<>>(d_A, d_B, d_C, numElements);\n", + " err = cudaGetLastError();\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to launch vectorAdd kernel (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Copy the device result vector in device memory to the host result vector\n", + " // in host memory.\n", + " printf(\"Copy output data from the CUDA device to the host memory\\n\");\n", + " err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\n", + " \"Failed to copy vector C from device to host (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Verify that the result vector is correct\n", + " for (int i = 0; i < numElements; ++i) {\n", + " if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) {\n", + " printf(\"Result verification failed at element %d! A=%f B=%f C=%f\\n\", i, h_A[i], h_B[i], h_C[i]);\n", + " return 1;\n", + " exit(EXIT_FAILURE);\n", + " }\n", + " }\n", + "\n", + " printf(\"Test PASSED\\n\");\n", + "\n", + " // Free device global memory\n", + " err = cudaFree(d_A);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to free device vector A (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " err = cudaFree(d_B);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to free device vector B (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " err = cudaFree(d_C);\n", + "\n", + " if (err != cudaSuccess) {\n", + " printf(\"Failed to free device vector C (error code %s)!\\n\",\n", + " cudaGetErrorString(err));\n", + " exit(EXIT_FAILURE);\n", + " }\n", + "\n", + " // Free host memory\n", + " free(h_A);\n", + " free(h_B);\n", + " free(h_C);\n", + "\n", + " printf(\"Done Add\\n\");\n", + " return 0;\n", + "}\n", + "\n", + "///\n", + "Vector3 v;\n", + "///\n", + "\n", + "devicePropertyPrint();\n", + "main();\n", + "main();\n", + "\n", + "Render(100);\n", + "printf(\"Done Render\\n\");" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "CUDA (C++17)", + "language": "CUDA", + "name": "cuda-xcpp17" + }, + "language_info": { + "codemirror_mode": "text/x-c++src", + "file_extension": ".cpp", + "mimetype": "text/x-c++src", + "name": "c++", + "version": "17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}