Skip to content

Commit

Permalink
Use StaticArrays to compute histogram (#94)
Browse files Browse the repository at this point in the history
  • Loading branch information
tkf authored Jan 30, 2022
1 parent 83d6fab commit 4cbd945
Show file tree
Hide file tree
Showing 12 changed files with 105 additions and 19 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,4 @@ _jlpkg-test/environments/jl17:
$(JULIA17) $(JUSLIA_CMD) -e 'using Pkg; Pkg.$(JLPKG_COMMAND)()' --project=test/environments/jl17

_jlpkg-docs:
$(JULIA) $(JUSLIA_CMD) -e 'using Pkg; Pkg.$(JLPKG_COMMAND)()' --project=docs
$(JULIA16) $(JUSLIA_CMD) -e 'using Pkg; Pkg.$(JLPKG_COMMAND)()' --project=docs
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ UnionArrays = "d6dd79e4-993b-11e9-1366-0de1c9fe1122"
[compat]
CUDA = "3.8"
InitialValues = "0.2, 0.3"
Transducers = "0.4.63"
Transducers = "0.4.69"
UnionArrays = "0.1.2"
julia = "1.6"
6 changes: 3 additions & 3 deletions docs/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ version = "0.2.6"
deps = ["CUDA", "InitialValues", "Transducers", "UnionArrays"]
path = ".."
uuid = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
version = "0.1.8-DEV"
version = "0.1.9-DEV"

[[Future]]
deps = ["Random"]
Expand Down Expand Up @@ -490,9 +490,9 @@ version = "0.5.15"

[[Transducers]]
deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"]
git-tree-sha1 = "3f0945b47207a41946baee6d1385e4ca738c25f7"
git-tree-sha1 = "a34f53c9e14d131b0ce114f591d3c5d428431ba0"
uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999"
version = "0.4.68"
version = "0.4.69"

[[UUIDs]]
deps = ["Random", "SHA"]
Expand Down
40 changes: 40 additions & 0 deletions examples/histograms.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using CUDA
using Folds
using FoldsCUDA
using StaticArrays
using Transducers
using Transducers: whencompletebasecase

# TODO: pretty FLoops syntax
"""
    countints_svector_functional(indices, ::Val{n}; ex = PreferParallel())

Count how often each integer in `indices` occurs, using `n` bins, and return the
counts as the result of `Folds.reduce` run with executor `ex`.

Values of `indices` below the first bin or above the last bin are clamped onto
the first or last bin respectively.
"""
function countints_svector_functional(indices, ::Val{n}; ex = PreferParallel()) where {n}
    # Every basecase starts from its own zeroed, mutable `MVector` of `n`
    # counters that serves as a basecase-local sub-histogram.
    newbuffer() = zero(MVector{n,Int})

    # Bump the counter selected by `i`; out-of-range `i` is clamped to the
    # first/last valid index of the buffer.
    function increment!(buf, i)
        slot = max(firstindex(buf), min(i, lastindex(buf)))
        @inbounds buf[slot] += 1
        return buf
    end

    # Once a basecase is finished, turn its buffer into an immutable
    # `SVector` so the value can be shared across threads.
    freeze(buf) = SVector(buf)

    # Merging two partial histograms is plain point-wise addition.
    addcounts(acc, part) = acc .+ part

    # Assemble the reducing function step by step: initialization, basecase
    # completion, and finally the cross-thread combine.
    with_init = wheninit(newbuffer)(increment!)
    with_freeze = whencompletebasecase(freeze)(with_init)
    rf = whencombine(addcounts)(with_freeze)

    return Folds.reduce(rf, indices, ex)
end
3 changes: 2 additions & 1 deletion src/FoldsCUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ using Transducers:
IdentityTransducer,
Reduction,
_reducingfunction,
completebasecase,
extract_transducer,
foldl_nocomplete
foldl_basecase

include("utils.jl")
include("kernels.jl")
Expand Down
1 change: 1 addition & 0 deletions src/introspection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ function Base.showerror(io::IO, err::FailedInference)
println(io)
print(io, "Note: on the host, the return type is inferred as ", err.host_return)
end
println(io)
printstyled(io, "HINT"; bold = true, color = :light_black)
printstyled(
io,
Expand Down
21 changes: 16 additions & 5 deletions src/kernels.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,21 @@ const _TRUE_ = Ref(true)
function fake_transduce(rf, xs, init, ::Val{IncludeInit} = Val(false)) where {IncludeInit}
if IncludeInit
if _TRUE_[]
return start(rf, init)
return completebasecase(rf, start(rf, init))
end
end
if _TRUE_[]
acc1 = next(rf, start(rf, init), first(xs))
for x in xs
acc1 = next(rf, acc1, x)
end
return acc1
return completebasecase(rf, acc1)
else
return _combine(rf, fake_transduce(rf, xs, init), fake_transduce(rf, xs, init))
acc1 = fake_transduce(rf, xs, init)
acc2 = fake_transduce(rf, xs, init)
acc3 = _combine(rf, acc1, acc2)
acc4 = completebasecase(rf, acc3)
return acc4
end
end

Expand Down Expand Up @@ -98,7 +102,7 @@ function _infer_acctype(rf, init, arrays, include_init::Bool = false)
)
fake_args_tt = Tuple{map(Typeof, fake_args)...}
acctype = CUDA.return_type(fake_transduce, fake_args_tt)
if acctype === Union{}
if acctype === Union{} || !Base.datatype_pointerfree(Some{acctype})
host_args = (rf, zip(arrays...), init)
acctype_host = Core.Compiler.return_type(fake_transduce, Tuple{map(Typeof, host_args)...})
if RUN_ON_HOST_IF_NORETURN[] && acctype_host === Union{}
Expand Down Expand Up @@ -194,6 +198,13 @@ function _transduce!(buf, rf::F, init, arrays...) where {F}
return dest, buf
end

# Since CUDA already requires that everything is inlined, `restack` is not
# useful. Instead, it's better to avoid introducing extra function calls to
# reduce the chance that the inliner gives up.
@static if isdefined(Transducers, :restack) && isdefined(CUDA, Symbol("@device_override"))
CUDA.@device_override Transducers.restack(x) = x
end

function transduce_kernel!(
dest::Union{AbstractArray,Nothing},
rf::F,
Expand All @@ -216,7 +227,7 @@ function transduce_kernel!(
x1 = @inbounds getvalues(idx[i1], arrays...)
@inline getinput(i) = @inbounds getvalues(idx[i], arrays...)
xf = Map(getinput)
acc = foldl_nocomplete(
acc = foldl_basecase(
Reduction(xf, rf),
next(rf, start(rf, init), x1),
offset*basesize+2:min((offset + 1) * basesize, n),
Expand Down
1 change: 1 addition & 0 deletions test/FoldsCUDATests/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Random123 = "74087812-796a-5b5d-8853-05524746bad3"
Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SplittablesBase = "171d559e-b47b-412a-8079-5efa626c420e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestFunctionRunner = "792026f5-ac9a-4a19-adcb-47b0ce2deb5d"
Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"
Expand Down
19 changes: 19 additions & 0 deletions test/FoldsCUDATests/src/test_histograms.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
module TestHistograms

# Load the example script directly so that the example itself is what gets
# tested; it defines `countints_svector_functional` and brings `CUDA` (among
# others) into scope via its own `using` statements.
include("../../../examples/histograms.jl")
using Test

# Feeding the indices 1:10 with 10 bins must put exactly one count in each bin.
function test_one_to_ten()
    indices = CuVector(1:10)
    h = countints_svector_functional(indices, Val(10))
    @test collect(h) == fill(1, 10)
end

# Map `CUDA.rand` samples to bin indices via `floor(Int, x * 10) + 1` and check
# that all bins end up with approximately the same count.
function test_rand()
    indices = (floor(Int, x * 10) + 1 for x in CUDA.rand(2^30))
    h = countints_svector_functional(indices, Val(10))
    h = collect(h)
    # Normalize by the first bin and compare approximately (`≈` forwards
    # `atol` to `isapprox`); every ratio should be within 0.01 of 1.
    @test h ./ h[1] ≈ fill(1, 10) atol = 0.01
end

end # module
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Random123 = "74087812-796a-5b5d-8853-05524746bad3"
Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SplittablesBase = "171d559e-b47b-412a-8079-5efa626c420e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TestFunctionRunner = "792026f5-ac9a-4a19-adcb-47b0ce2deb5d"
Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"
Expand Down
14 changes: 10 additions & 4 deletions test/environments/jl16/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ version = "0.2.6"
deps = ["CUDA", "InitialValues", "Transducers", "UnionArrays"]
path = "../../.."
uuid = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
version = "0.1.8-DEV"
version = "0.1.9-DEV"

[[FoldsCUDABenchmarks]]
deps = ["BenchmarkTools", "CUDA", "Folds", "FoldsCUDA"]
Expand All @@ -195,7 +195,7 @@ uuid = "1457febb-a09b-4652-98c9-46b8ccd8ff53"
version = "0.1.0"

[[FoldsCUDATests]]
deps = ["Adapt", "Aqua", "CUDA", "Documenter", "FLoops", "Folds", "FoldsCUDA", "FoldsCUDABenchmarks", "GPUArrays", "InitialValues", "LiterateTest", "Random", "Random123", "Referenceables", "Setfield", "SplittablesBase", "Test", "TestFunctionRunner", "Transducers"]
deps = ["Adapt", "Aqua", "CUDA", "Documenter", "FLoops", "Folds", "FoldsCUDA", "FoldsCUDABenchmarks", "GPUArrays", "InitialValues", "LiterateTest", "Random", "Random123", "Referenceables", "Setfield", "SplittablesBase", "StaticArrays", "Test", "TestFunctionRunner", "Transducers"]
path = "../../FoldsCUDATests"
uuid = "d11caea5-3c98-4cd5-8a56-9589fe6662ee"
version = "0.1.0"
Expand Down Expand Up @@ -476,6 +476,12 @@ git-tree-sha1 = "39c9f91521de844bad65049efd4f9223e7ed43f9"
uuid = "171d559e-b47b-412a-8079-5efa626c420e"
version = "0.1.14"

[[StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "2884859916598f974858ff01df7dfc6c708dd895"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.3.3"

[[Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down Expand Up @@ -523,9 +529,9 @@ version = "0.5.15"

[[Transducers]]
deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"]
git-tree-sha1 = "3f0945b47207a41946baee6d1385e4ca738c25f7"
git-tree-sha1 = "a34f53c9e14d131b0ce114f591d3c5d428431ba0"
uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999"
version = "0.4.68"
version = "0.4.69"

[[UUIDs]]
deps = ["Random", "SHA"]
Expand Down
14 changes: 10 additions & 4 deletions test/environments/jl17/Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ version = "0.2.6"
deps = ["CUDA", "InitialValues", "Transducers", "UnionArrays"]
path = "../../.."
uuid = "6cd66ae4-5932-4b96-926d-e73e578e42cc"
version = "0.1.8-DEV"
version = "0.1.9-DEV"

[[deps.FoldsCUDABenchmarks]]
deps = ["BenchmarkTools", "CUDA", "Folds", "FoldsCUDA"]
Expand All @@ -198,7 +198,7 @@ uuid = "1457febb-a09b-4652-98c9-46b8ccd8ff53"
version = "0.1.0"

[[deps.FoldsCUDATests]]
deps = ["Adapt", "Aqua", "CUDA", "Documenter", "FLoops", "Folds", "FoldsCUDA", "FoldsCUDABenchmarks", "GPUArrays", "InitialValues", "LiterateTest", "Random", "Random123", "Referenceables", "Setfield", "SplittablesBase", "Test", "TestFunctionRunner", "Transducers"]
deps = ["Adapt", "Aqua", "CUDA", "Documenter", "FLoops", "Folds", "FoldsCUDA", "FoldsCUDABenchmarks", "GPUArrays", "InitialValues", "LiterateTest", "Random", "Random123", "Referenceables", "Setfield", "SplittablesBase", "StaticArrays", "Test", "TestFunctionRunner", "Transducers"]
path = "../../FoldsCUDATests"
uuid = "d11caea5-3c98-4cd5-8a56-9589fe6662ee"
version = "0.1.0"
Expand Down Expand Up @@ -483,6 +483,12 @@ git-tree-sha1 = "39c9f91521de844bad65049efd4f9223e7ed43f9"
uuid = "171d559e-b47b-412a-8079-5efa626c420e"
version = "0.1.14"

[[deps.StaticArrays]]
deps = ["LinearAlgebra", "Random", "Statistics"]
git-tree-sha1 = "2884859916598f974858ff01df7dfc6c708dd895"
uuid = "90137ffa-7385-5640-81b9-e52037218182"
version = "1.3.3"

[[deps.Statistics]]
deps = ["LinearAlgebra", "SparseArrays"]
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down Expand Up @@ -530,9 +536,9 @@ version = "0.5.15"

[[deps.Transducers]]
deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"]
git-tree-sha1 = "3f0945b47207a41946baee6d1385e4ca738c25f7"
git-tree-sha1 = "a34f53c9e14d131b0ce114f591d3c5d428431ba0"
uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999"
version = "0.4.68"
version = "0.4.69"

[[deps.UUIDs]]
deps = ["Random", "SHA"]
Expand Down

0 comments on commit 4cbd945

Please sign in to comment.